Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,10 @@ jobs:
coverage run -a --source=src/lenskit -m lenskit data fetch -D data/az18 --amazon --edition 2018 Video_Games
coverage run -a --source=src/lenskit -m lenskit data fetch -D data/az14 --amazon --edition 2014 Video_Games
coverage run -a --source=src/lenskit -m lenskit data fetch --ms-web
if [[ ! -f data/australian_users_items.json.gz ]]; then
curl -fsL -o data/australian_users_items.json.gz.tmp https://mcauleylab.ucsd.edu/public_datasets/data/steam/australian_users_items.json.gz
mv data/australian_users_items.json.gz.tmp data/australian_users_items.json.gz
fi

- name: Run Eval Tests
run: |
Expand Down
28 changes: 28 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ rand_pcg = "^0.10"
rustc-hash = "^2.1.1"
sha1 = "^0.11"

peg = "^0.8"
serde = { version = "^1.0", features = ["derive"] }
serde_json = "^1.0"

Expand Down
2 changes: 2 additions & 0 deletions src/accel/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ mod cooc;
mod coordinates;
mod index;
mod pairs;
mod pyon;
mod sampling;
mod scatter;
mod selection;
Expand Down Expand Up @@ -48,6 +49,7 @@ pub fn register_data(parent: &Bound<'_, PyModule>) -> PyResult<()> {
data.add_function(wrap_pyfunction!(cooc::count_cooc, &data)?)?;
data.add_function(wrap_pyfunction!(cooc::dense_cooc, &data)?)?;
data.add_function(wrap_pyfunction!(hash_array, &data)?)?;
data.add_function(wrap_pyfunction!(pyon::pyon_loads, &data)?)?;

Ok(())
}
Expand Down
146 changes: 146 additions & 0 deletions src/accel/data/pyon.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// This file is part of LensKit.
// Copyright (C) 2018-2023 Boise State University.
// Copyright (C) 2023-2026 Drexel University.
// Licensed under the MIT license, see LICENSE.md for details.
// SPDX-License-Identifier: MIT

//! Support for reading invalid JSON that is actually valid Python expression syntax.

use std::borrow::Cow;

use log::*;
use pyo3::{
exceptions::PyValueError,
prelude::*,
types::{PyBool, PyDict, PyFloat, PyInt, PyList, PyNone, PyString},
};

use serde_json::{Number, Value};

/// Parse a “PyON” document: JSON-like text written with Python literal syntax.
///
/// `text` is parsed with the `pyon_parser::expr` grammar and the resulting tree is
/// converted into Python objects by `realize_value`.
///
/// # Errors
///
/// Returns a `ValueError` when `text` does not match the grammar (the failure is
/// also logged at error level), and propagates any error raised while building
/// the Python objects.
#[pyfunction]
pub fn pyon_loads<'py>(py: Python<'py>, text: &str) -> PyResult<Bound<'py, PyAny>> {
    match pyon_parser::expr(text) {
        Ok(ast) => realize_value(py, ast),
        Err(e) => {
            // The parser reports a *byte* offset, so slice by bytes rather than
            // counting characters; otherwise any non-ASCII text before the error
            // makes the reported character wrong.
            let found = text
                .get(e.location.offset..)
                .and_then(|rest| rest.chars().next())
                .map_or_else(|| "<EOF>".to_string(), |c| format!("“{}”", c));
            error!(
                "parse error at {}:{}: found {}, expected {:}",
                e.location.line, e.location.column, found, e.expected
            );
            Err(PyValueError::new_err(format!("parse error: {:}", e)))
        }
    }
}

// Grammar for "PyON": JSON-like data written with Python literal syntax
// (single- or double-quoted strings, an optional `b` prefix, `True`/`False`/`None`
// alongside `true`/`false`/`null`, and trailing commas).  The parse result is a
// `serde_json::Value` tree.
//
// Every action that can fail on user input uses a conditional `{? ... }` block,
// so malformed documents produce a parse error rather than a panic.
peg::parser! {
    grammar pyon_parser() for str {
        // Optional whitespace between tokens.
        rule _ = quiet!{[' ' | '\t' | '\r' | '\n']*}

        rule digit() = ['0'..='9']
        rule digits() -> &'input str
            = $(digit()+)

        rule hex_digit() = ['a'..='f' | 'A'..='F' | '0'..='9']

        // Decimal floats, negative integers (i64), and non-negative integers (u64).
        // Values that do not fit (non-finite floats, integers wider than 64 bits)
        // are rejected as parse errors.
        rule number() -> Value
            = e:$("-"? digits() "." digits()) {?
                e.parse::<f64>()
                    .ok()
                    .and_then(Number::from_f64)
                    .map(Value::Number)
                    .ok_or("finite floating-point number")
            }
            / e:$("-" digits()) {?
                e.parse::<i64>()
                    .map(|n| Value::Number(n.into()))
                    .or(Err("integer that fits in 64 bits"))
            }
            / e:digits() {?
                e.parse::<u64>()
                    .map(|n| Value::Number(n.into()))
                    .or(Err("integer that fits in 64 bits"))
            }

        rule boolean() -> Value
            = ("true" / "True") { Value::Bool(true) }
            / ("false" / "False") { Value::Bool(false) }

        rule null() -> Value
            = ("null" / "None") { Value::Null }

        // A quoted string with an optional (ignored) bytes prefix; the literal
        // runs and decoded escapes are concatenated into one owned `String`.
        rule _string() -> String
            = "b"? "'" parts:((_str_part_sq() / echar())*) "'" { parts.into_iter().collect() }
            / "b"? "\"" parts:((_str_part_dq() / echar())*) "\"" { parts.into_iter().collect() }

        // Longest run without a closing quote or backslash, borrowed from the input.
        rule _str_part_sq() -> Cow<'input, str>
            = s:$([^'\'' | '\\']+) { Cow::Borrowed(s) }

        rule _str_part_dq() -> Cow<'input, str>
            = s:$([^'"' | '\\']+) { Cow::Borrowed(s) }

        rule string() -> Value
            = s:_string() { Value::String(s) }

        // Hex payload of a `\uXXXX` or `\UXXXXXXXX` escape.
        rule code_point_hex() -> &'input str
            = "\\u" h:$(hex_digit()*<4,4>) { h }
            / "\\U" h:$(hex_digit()*<8,8>) { h }

        // A single backslash escape.  Unknown escapes yield the escaped character
        // itself (so `\'`, `\"` and `\\` work).
        rule echar() -> Cow<'input, str>
            = "\\t" { "\t".into() }
            / "\\r" { "\r".into() }
            / "\\n" { "\n".into() }
            / x:code_point_hex() {
                // The grammar guarantees 4 or 8 hex digits, which always fit in u32.
                let code = u32::from_str_radix(x, 16).expect("4 or 8 hex digits fit in u32");
                // Lone surrogates and out-of-range values cannot be stored in a Rust
                // string; substitute U+FFFD instead of panicking on user input.
                char::from_u32(code)
                    .unwrap_or(char::REPLACEMENT_CHARACTER)
                    .to_string()
                    .into()
            }
            / "\\" c:$([_]) { c.into() }

        // Whitespace is allowed on both sides of an optional trailing comma.
        rule list() -> Value
            = "[" e:(expr() ** ",") _ ("," _)? "]" { Value::Array(e) }

        rule object() -> Value
            = "{" entries:(object_entry() ** ",") _ ("," _)? "}" {
                Value::Object(entries.into_iter().collect())
            }

        rule object_entry() -> (String, Value)
            = _ k:_string() _ ":" v:expr() { (k, v) }

        rule _expr() -> Value
            = null()
            / boolean()
            / number()
            / string()
            / list()
            / object()

        // Entry point: one value with optional surrounding whitespace.
        pub rule expr() -> Value = _ e:_expr() _ { e }
    }
}

/// Recursively convert a parsed [`Value`] tree into the equivalent Python object.
///
/// Null becomes `None`, booleans become `bool`, integers become `int`, other
/// numbers become `float`, strings become `str`, arrays become `list`, and
/// objects become `dict` with string keys.
///
/// # Errors
///
/// Returns a `ValueError` for a number that is neither an integer nor
/// representable as `f64`, and propagates any error raised while filling a
/// list or dict.
fn realize_value<'py>(py: Python<'py>, ast: Value) -> PyResult<Bound<'py, PyAny>> {
    match ast {
        Value::Null => Ok(PyNone::get(py).to_owned().into_any()),
        Value::Bool(val) => Ok(PyBool::new(py, val).to_owned().into_any()),
        Value::Number(n) => {
            if let Some(i) = n.as_i64() {
                Ok(PyInt::new(py, i).into_any())
            } else if let Some(u) = n.as_u64() {
                // Integers above i64::MAX must stay exact Python ints instead of
                // falling through to the lossy float conversion below.
                Ok(PyInt::new(py, u).into_any())
            } else if let Some(x) = n.as_f64() {
                Ok(PyFloat::new(py, x).into_any())
            } else {
                Err(PyValueError::new_err(format!("invalid number {:?}", n)))
            }
        }
        Value::String(s) => Ok(PyString::new(py, &s).into_any()),
        Value::Array(list) => {
            let out = PyList::empty(py);
            for elt in list {
                out.append(realize_value(py, elt)?)?;
            }

            Ok(out.into_any())
        }
        Value::Object(dict) => {
            let out = PyDict::new(py);
            for (k, v) in dict {
                out.set_item(k, realize_value(py, v)?)?;
            }

            Ok(out.into_any())
        }
    }
}
2 changes: 2 additions & 0 deletions src/lenskit/_accel/data.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ from __future__ import annotations

import numpy as np
import pyarrow as pa
from pydantic import JsonValue
from typing_extensions import TypeVar

from lenskit.data.matrix import SparseRowArray
Expand Down Expand Up @@ -52,6 +53,7 @@ def sample_negatives(
seed: int,
) -> np.ndarray[tuple[int, int], np.dtype[np.int32]]: ...
def hash_array(arr: pa.Array, /) -> str: ...
def pyon_loads(data: str) -> JsonValue: ...

class IDIndex:
"""
Expand Down
10 changes: 8 additions & 2 deletions src/lenskit/cli/data/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

import click

from lenskit.data import load_amazon_ratings, load_movielens
from lenskit.data.msweb import load_ms_web
from lenskit.logging import get_logger

_log = get_logger(__name__)
Expand All @@ -19,6 +17,7 @@
@click.option("--movielens", "format", flag_value="movielens", help="Convert MovieLens data.")
@click.option("--amazon", "format", flag_value="amazon", help="Convert Amazon rating data.")
@click.option("--ms-web", "format", flag_value="ms-web", help="Convert MSWeb visit logs.")
@click.option("--steam", "format", flag_value="steam", help="Convert Steam interaction data")
@click.option(
"--item-lists", is_flag=True, help="Convert to an ItemListCollection instead of Dataset."
)
Expand All @@ -40,13 +39,20 @@ def convert(format: str | None, src: list[Path], dst: Path, item_lists: bool = F
_log.error("no data format specified")
raise click.UsageError("no data format specified")
case "movielens":
from lenskit.data.sources.movielens import load_movielens

log.info("loading MovieLens data")
if len(src) != 1:
log.error("received %d source paths, MovieLens only takes one", len(src))

data = load_movielens(src[0])
case "amazon":
from lenskit.data.sources.amazon import load_amazon_ratings

data = load_amazon_ratings(*src)
case "ms-web":
from lenskit.data.sources.msweb import load_ms_web

data = load_ms_web(src[0])
case _:
raise ValueError(f"unknown data format {format}")
Expand Down
31 changes: 22 additions & 9 deletions src/lenskit/cli/data/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,48 @@
from rich.console import Console
from rich.markdown import Markdown

from lenskit.data import Dataset, load_amazon_ratings, load_movielens
from lenskit.data import Dataset
from lenskit.data._summary import save_stats
from lenskit.logging import get_logger

_log = get_logger(__name__)


@click.command("describe")
@click.option("--movielens", "format", flag_value="movielens", help="describe MovieLens data")
@click.option("--amazon", "format", flag_value="amazon", help="describe Amazon rating data")
@click.option("--movielens", "format", flag_value="movielens", help="Describe MovieLens data.")
@click.option("--amazon", "format", flag_value="amazon", help="Describe Amazon rating data.")
@click.option("--steam", "format", flag_value="steam", help="Describe Steam interaction data.")
@click.option("--markdown", is_flag=True, help="output raw Markdown")
@click.argument("path", type=Path)
def describe(format: str | None, markdown: bool, path: Path):
@click.argument("path", type=Path, nargs=-1, required=True)
def describe(format: str | None, markdown: bool, path: list[Path]):
"""
Describe a data set.
"""

log = _log.bind(path=str(path))
if len(path) == 1:
log = _log.bind(path=str(path[0]))
else:
log = _log.bind(path=[str(p) for p in path])

match format:
case None:
log.info("loading LensKit native data")
data = Dataset.load(path)
data = Dataset.load(path[0])
case "movielens":
from lenskit.data.sources.movielens import load_movielens

log.info("loading MovieLens data")
data = load_movielens(path)
data = load_movielens(path[0])
case "amazon":
from lenskit.data.sources.amazon import load_amazon_ratings

log.info("loading Amazon data")
data = load_amazon_ratings(path)
data = load_amazon_ratings(path[0])
case "steam":
from lenskit.data.sources.steam import load_steam

log.info("loading Steam data")
data = load_steam(*path)
case _:
raise ValueError(f"unknown data format {format}")

Expand Down
9 changes: 3 additions & 6 deletions src/lenskit/data/_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,8 +828,7 @@ def add_scalar_attribute(
raise DataError(f"no entities of class {cls}")

nums = self._resolve_entity_ids(cls, entities, e_tbl)
if not np.all(nums.is_valid()): # pragma: nocover
n_bad = nums.is_valid().sum().as_py()
if n_bad := nums.null_count: # pragma: nocover
raise DataError(f"{n_bad} unknown entity IDs")

val_array: pa.Array = pa.array(values) # type: ignore
Expand Down Expand Up @@ -915,8 +914,7 @@ def add_list_attribute(
if e_tbl is None: # pragma: nocover
raise DataError(f"no entities of class {cls}")
nums = self._resolve_entity_ids(cls, entities, e_tbl)
if not np.all(nums.is_valid()): # pragma: nocover
n_bad = nums.is_valid().sum().as_py()
if n_bad := nums.null_count: # pragma: nocover
raise DataError(f"{n_bad} unknown entity IDs")

val_array: pa.Array = pa.array(values) # type: ignore
Expand Down Expand Up @@ -980,8 +978,7 @@ def add_vector_attribute(
if e_tbl is None: # pragma: nocover
raise DataError(f"no entities of class {cls}")
nums = self._resolve_entity_ids(cls, entities, e_tbl)
if not np.all(nums.is_valid()): # pragma: nocover
n_bad = nums.is_valid().sum().as_py()
if n_bad := nums.null_count: # pragma: nocover
raise DataError(f"{n_bad} unknown entity IDs")

tbl_valid = np.zeros(e_tbl.num_rows, dtype=np.bool_)
Expand Down
Loading
Loading