diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9c4e232bc..dff88e440 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -295,6 +295,10 @@ jobs: coverage run -a --source=src/lenskit -m lenskit data fetch -D data/az18 --amazon --edition 2018 Video_Games coverage run -a --source=src/lenskit -m lenskit data fetch -D data/az14 --amazon --edition 2014 Video_Games coverage run -a --source=src/lenskit -m lenskit data fetch --ms-web + if [[ ! -f data/australian_users_items.json.gz ]]; then + curl -fsL -o data/australian_users_items.json.gz.tmp https://mcauleylab.ucsd.edu/public_datasets/data/steam/australian_users_items.json.gz + mv data/australian_users_items.json.gz.tmp data/australian_users_items.json.gz + fi - name: Run Eval Tests run: | @@ -302,7 +306,7 @@ jobs: - name: ๐Ÿ•บ๐Ÿป Test LensKit CLI run: | - mise run test-cli -- --coverage --cov-append tests/cli/test-tune.sh + mise run test-cli -- --coverage --cov-append shell: bash -e {0} env: LK_TUNE_JOBS: 2 diff --git a/Cargo.lock b/Cargo.lock index 8c4b116d8..530b7ff82 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -644,6 +644,7 @@ dependencies = [ "ntest", "numpy", "ordered-float", + "peg", "pyo3", "rand", "rand_pcg", @@ -868,6 +869,33 @@ dependencies = [ "num-traits", ] +[[package]] +name = "peg" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aad070be5b63aa72103f2fcdd70a83adbd5e90112ce5b574171ff1c65501773" +dependencies = [ + "peg-macros", + "peg-runtime", +] + +[[package]] +name = "peg-macros" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd8ef6825cae95355031ae26a99b616a2a21f22ba2de0197c43dfb05acbe7ee" +dependencies = [ + "peg-runtime", + "proc-macro2", + "quote", +] + +[[package]] +name = "peg-runtime" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7011d97b484a5ebdc4b1fdb3b12d5e4bbbea56e9d22b688f2e79e04b65a7d8a6" 
+ [[package]] name = "pin-project-lite" version = "0.2.17" diff --git a/Cargo.toml b/Cargo.toml index 49a454844..cbe6d373a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ rand_pcg = "^0.10" rustc-hash = "^2.1.1" sha1 = "^0.11" +peg = "^0.8" serde = { version = "^1.0", features = ["derive"] } serde_json = "^1.0" diff --git a/docs/releases/2026.rst b/docs/releases/2026.rst index 339272157..8bd1ec1f9 100644 --- a/docs/releases/2026.rst +++ b/docs/releases/2026.rst @@ -17,6 +17,9 @@ rough corners polished off the interfaces, and hopefully fewer bugs. Third release in 2026, with more feature updates and bug fixes. +- Added initial support for importing Steam user/item data + (:py:mod:`lenskit.data.sources.steam`, :pr:`1111`). + .. _2026.2.1: 2026.2.1 diff --git a/src/accel/data/mod.rs b/src/accel/data/mod.rs index 11b46ec96..0957af7fb 100644 --- a/src/accel/data/mod.rs +++ b/src/accel/data/mod.rs @@ -20,6 +20,7 @@ mod cooc; mod coordinates; mod index; mod pairs; +mod pyon; mod sampling; mod scatter; mod selection; @@ -48,6 +49,7 @@ pub fn register_data(parent: &Bound<'_, PyModule>) -> PyResult<()> { data.add_function(wrap_pyfunction!(cooc::count_cooc, &data)?)?; data.add_function(wrap_pyfunction!(cooc::dense_cooc, &data)?)?; data.add_function(wrap_pyfunction!(hash_array, &data)?)?; + data.add_function(wrap_pyfunction!(pyon::pyon_loads, &data)?)?; Ok(()) }
diff --git a/src/accel/data/pyon.rs b/src/accel/data/pyon.rs
new file mode 100644
index 000000000..aac90d086
--- /dev/null
+++ b/src/accel/data/pyon.rs
@@ -0,0 +1,190 @@
+// This file is part of LensKit.
+// Copyright (C) 2018-2023 Boise State University.
+// Copyright (C) 2023-2026 Drexel University.
+// Licensed under the MIT license, see LICENSE.md for details.
+// SPDX-License-Identifier: MIT
+
+//! Support for reading invalid JSON that is actually valid Python expression syntax.
+
+use std::borrow::Cow;
+
+use log::*;
+use pyo3::{
+    exceptions::PyValueError,
+    prelude::*,
+    types::{PyBool, PyDict, PyFloat, PyInt, PyList, PyNone, PyString},
+};
+
+use serde_json::{Number, Value};
+
+/// Parse a “pyon” (Python object notation) string into Python objects.
+///
+/// Accepts JSON as well as the Python literal spellings it is often confused
+/// with (`True`/`False`/`None`, single-quoted and `b`/`u`-prefixed strings,
+/// trailing commas).  Raises `ValueError` if `text` cannot be parsed.
+#[pyfunction]
+pub fn pyon_loads<'py>(py: Python<'py>, text: &str) -> PyResult<Bound<'py, PyAny>> {
+    match pyon_parser::expr(text) {
+        Ok(ast) => realize_value(py, ast),
+        Err(e) => {
+            // peg reports a *byte* offset, so slice the input rather than counting
+            // characters (which points at the wrong place after non-ASCII text).
+            let found = text
+                .get(e.location.offset..)
+                .and_then(|rest| rest.chars().next())
+                .map(|c| format!("“{}”", c))
+                .unwrap_or_else(|| "end of input".into());
+            error!(
+                "parse error at {}:{}: found {}, expected {:}",
+                e.location.line, e.location.column, found, e.expected
+            );
+            Err(PyValueError::new_err(format!("parse error: {:}", e)))
+        }
+    }
+}
+
+/// Decode the hex digits of a `\x`, `\u`, or `\U` escape into a one-character string.
+///
+/// Fails instead of panicking when the digits do not name a Unicode scalar value
+/// (e.g. a lone surrogate), so bad input surfaces as an ordinary parse error.
+fn decode_hex_escape(hex: &str) -> Result<Cow<'static, str>, &'static str> {
+    u32::from_str_radix(hex, 16)
+        .ok()
+        .and_then(char::from_u32)
+        .map(|c| Cow::Owned(c.to_string()))
+        .ok_or("escape for a valid Unicode character")
+}
+
+/// Combine a UTF-16 surrogate pair written as two `\uXXXX` escapes (the JSON
+/// spelling of characters outside the BMP) into a one-character string.
+fn decode_surrogate_pair(hi: &str, lo: &str) -> Result<Cow<'static, str>, &'static str> {
+    const EXPECTED: &str = "UTF-16 surrogate pair";
+    let hi = u32::from_str_radix(hi, 16).map_err(|_| EXPECTED)?;
+    let lo = u32::from_str_radix(lo, 16).map_err(|_| EXPECTED)?;
+    if !(0xD800..=0xDBFF).contains(&hi) || !(0xDC00..=0xDFFF).contains(&lo) {
+        return Err(EXPECTED);
+    }
+    let code = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
+    char::from_u32(code)
+        .map(|c| Cow::Owned(c.to_string()))
+        .ok_or(EXPECTED)
+}
+
+peg::parser! {
+    grammar pyon_parser() for str {
+        rule _ = quiet!{[' ' | '\n' | '\t' | '\r']*}
+
+        rule digit() = ['0'..='9']
+        rule digits() -> &'input str
+            = $(digit()+)
+        rule hex() = ['a'..='f' | 'A'..='F' | '0'..='9']
+        rule exponent() = ['e' | 'E'] ['+' | '-']? digit()+
+
+        // Conversions use `{? ... }` so that out-of-range literals fail the parse
+        // instead of panicking inside the extension module.
+        rule number() -> Value
+            = e:$("-"? digits() ("." digits() exponent()? / exponent())) {?
+                e.parse::<f64>()
+                    .ok()
+                    .and_then(Number::from_f64)
+                    .map(Value::Number)
+                    .ok_or("finite floating-point number")
+            }
+            / e:$("-"? digits()) {?
+                e.parse::<i64>()
+                    .map(Number::from)
+                    .or_else(|_| e.parse::<u64>().map(Number::from))
+                    .map(Value::Number)
+                    .map_err(|_| "64-bit integer")
+            }
+
+        rule boolean() -> Value
+            = ("true" / "True") { Value::Bool(true)}
+            / ("false" / "False") { Value::Bool(false)}
+
+        rule null() -> Value
+            = ("null" / "None") { Value::Null}
+
+        rule _string() -> String
+            = ("b" / "u")? "'" parts:((_str_part_sq() / echar())*) "'" {parts.into_iter().collect()}
+            / ("b" / "u")? "\"" parts:((_str_part_dq() / echar())*) "\"" {parts.into_iter().collect()}
+
+        rule _str_part_sq() -> Cow<'input, str>
+            = s:$([^'\'' | '\\']+) {Cow::Borrowed(s)}
+
+        rule _str_part_dq() -> Cow<'input, str>
+            = s:$([^'"' | '\\']+) {Cow::Borrowed(s)}
+
+        rule string() -> Value
+            = s:_string() {Value::String(s)}
+
+        rule echar() -> Cow<'input, str>
+            = "\\t" {"\t".into()}
+            / "\\r" {"\r".into()}
+            / "\\n" {"\n".into()}
+            / "\\x" x:$(hex()*<2,2>) {? decode_hex_escape(x) }
+            / "\\u" hi:$(hex()*<4,4>) "\\u" lo:$(hex()*<4,4>) {? decode_surrogate_pair(hi, lo) }
+            / "\\u" x:$(hex()*<4,4>) {? decode_hex_escape(x) }
+            / "\\U" x:$(hex()*<8,8>) {? decode_hex_escape(x) }
+            // Any other escaped character (quote, backslash, ...) stands for itself.
+            // Malformed hex escapes are excluded so they are reported, not mangled.
+            / "\\" c:$([^ 'x' | 'u' | 'U']) {c.into()}
+
+        rule list() -> Value
+            = "[" e:(expr() ** ",") _ ("," _)? "]" {Value::Array(e)}
+
+        rule object() -> Value
+            = "{" entries:(object_entry() ** ",") _ ("," _)? "}" {Value::Object(entries.into_iter().collect())}
+
+        rule object_entry() -> (String, Value)
+            = _ k:_string() _ ":" v:expr() {(k, v)}
+
+        rule _expr() -> Value
+            = null()
+            / boolean()
+            / number()
+            / string()
+            / list()
+            / object()
+
+        pub rule expr() -> Value = _ e:_expr() _ {e}
+    }
+}
+
+/// Convert a parsed value tree into the equivalent Python objects.
+fn realize_value<'py>(py: Python<'py>, ast: Value) -> PyResult<Bound<'py, PyAny>> {
+    match ast {
+        Value::Null => Ok(PyNone::get(py).to_owned().into_any()),
+        Value::Bool(val) => Ok(PyBool::new(py, val).to_owned().into_any()),
+        Value::Number(n) => {
+            if let Some(i) = n.as_i64() {
+                Ok(PyInt::new(py, i).into_any())
+            } else if let Some(u) = n.as_u64() {
+                // Integers above `i64::MAX` must not be rounded through `f64`.
+                Ok(PyInt::new(py, u).into_any())
+            } else if let Some(x) = n.as_f64() {
+                Ok(PyFloat::new(py, x).into_any())
+            } else {
+                Err(PyValueError::new_err(format!("invalid number {:?}", n)))
+            }
+        }
+        Value::String(s) => Ok(PyString::new(py, &s).into_any()),
+        Value::Array(list) => {
+            let out = PyList::empty(py);
+            for elt in list {
+                out.append(realize_value(py, elt)?)?;
+            }
+
+            Ok(out.into_any())
+        }
+        Value::Object(dict) => {
+            let out = PyDict::new(py);
+
+            for (k, v) in dict {
+                out.set_item(k, realize_value(py, v)?)?;
+            }
+
+            Ok(out.into_any())
+        }
+    }
+}
diff --git a/src/lenskit/_accel/data.pyi b/src/lenskit/_accel/data.pyi index 8ff274290..a2a2a309f 100644 --- a/src/lenskit/_accel/data.pyi +++ b/src/lenskit/_accel/data.pyi @@ -2,6 +2,7 @@ from __future__ import annotations import numpy as np import pyarrow as pa +from pydantic import JsonValue from typing_extensions import TypeVar from lenskit.data.matrix import SparseRowArray @@ -52,6 +53,7 @@ def sample_negatives( seed: int, ) -> np.ndarray[tuple[int, int], np.dtype[np.int32]]: ... def hash_array(arr: pa.Array, /) -> str: ... +def pyon_loads(text: str) -> JsonValue: ...
class IDIndex: """ diff --git a/src/lenskit/cli/data/convert.py b/src/lenskit/cli/data/convert.py index f46d09b84..c6648967d 100644 --- a/src/lenskit/cli/data/convert.py +++ b/src/lenskit/cli/data/convert.py @@ -8,8 +8,6 @@ import click -from lenskit.data import load_amazon_ratings, load_movielens -from lenskit.data.msweb import load_ms_web from lenskit.logging import get_logger _log = get_logger(__name__) @@ -19,6 +17,7 @@ @click.option("--movielens", "format", flag_value="movielens", help="Convert MovieLens data.") @click.option("--amazon", "format", flag_value="amazon", help="Convert Amazon rating data.") @click.option("--ms-web", "format", flag_value="ms-web", help="Convert MSWeb visit logs.") +@click.option("--steam", "format", flag_value="steam", help="Convert Steam interaction data.") @click.option( "--item-lists", is_flag=True, help="Convert to an ItemListCollection instead of Dataset." ) @@ -40,14 +39,25 @@ def convert(format: str | None, src: list[Path], dst: Path, item_lists: bool = F _log.error("no data format specified") raise click.UsageError("no data format specified") case "movielens": + from lenskit.data.sources.movielens import load_movielens + log.info("loading MovieLens data") if len(src) != 1: log.error("received %d source paths, MovieLens only takes one", len(src)) + data = load_movielens(src[0]) case "amazon": + from lenskit.data.sources.amazon import load_amazon_ratings + data = load_amazon_ratings(*src) case "ms-web": + from lenskit.data.sources.msweb import load_ms_web + data = load_ms_web(src[0]) + case "steam": + from lenskit.data.sources.steam import load_steam + data = load_steam(*src) case _: raise ValueError(f"unknown data format {format}") diff --git a/src/lenskit/cli/data/describe.py b/src/lenskit/cli/data/describe.py index 8b4f17b48..176a52d5f 100644 --- a/src/lenskit/cli/data/describe.py +++ b/src/lenskit/cli/data/describe.py @@ -12,7 +12,7 @@ from rich.console import Console from rich.markdown import Markdown -from
lenskit.data import Dataset, load_amazon_ratings, load_movielens +from lenskit.data import Dataset from lenskit.data._summary import save_stats from lenskit.logging import get_logger @@ -20,27 +20,40 @@ @click.command("describe") -@click.option("--movielens", "format", flag_value="movielens", help="describe MovieLens data") -@click.option("--amazon", "format", flag_value="amazon", help="describe Amazon rating data") +@click.option("--movielens", "format", flag_value="movielens", help="Describe MovieLens data.") +@click.option("--amazon", "format", flag_value="amazon", help="Describe Amazon rating data.") +@click.option("--steam", "format", flag_value="steam", help="Describe Steam interaction data.") @click.option("--markdown", is_flag=True, help="output raw Markdown") -@click.argument("path", type=Path) -def describe(format: str | None, markdown: bool, path: Path): +@click.argument("path", type=Path, nargs=-1, required=True) +def describe(format: str | None, markdown: bool, path: list[Path]): """ Describe a data set. 
""" - log = _log.bind(path=str(path)) + if len(path) == 1: + log = _log.bind(path=str(path[0])) + else: + log = _log.bind(path=[str(p) for p in path]) match format: case None: log.info("loading LensKit native data") - data = Dataset.load(path) + data = Dataset.load(path[0]) case "movielens": + from lenskit.data.sources.movielens import load_movielens + log.info("loading MovieLens data") - data = load_movielens(path) + data = load_movielens(path[0]) case "amazon": + from lenskit.data.sources.amazon import load_amazon_ratings + log.info("loading Amazon data") - data = load_amazon_ratings(path) + data = load_amazon_ratings(path[0]) + case "steam": + from lenskit.data.sources.steam import load_steam + + log.info("loading Steam data") + data = load_steam(*path) case _: raise ValueError(f"unknown data format {format}") diff --git a/src/lenskit/data/_builder.py b/src/lenskit/data/_builder.py index 90f48c220..b0728200e 100644 --- a/src/lenskit/data/_builder.py +++ b/src/lenskit/data/_builder.py @@ -828,8 +828,7 @@ def add_scalar_attribute( raise DataError(f"no entities of class {cls}") nums = self._resolve_entity_ids(cls, entities, e_tbl) - if not np.all(nums.is_valid()): # pragma: nocover - n_bad = nums.is_valid().sum().as_py() + if n_bad := nums.null_count: # pragma: nocover raise DataError(f"{n_bad} unknown entity IDs") val_array: pa.Array = pa.array(values) # type: ignore @@ -915,8 +914,7 @@ def add_list_attribute( if e_tbl is None: # pragma: nocover raise DataError(f"no entities of class {cls}") nums = self._resolve_entity_ids(cls, entities, e_tbl) - if not np.all(nums.is_valid()): # pragma: nocover - n_bad = nums.is_valid().sum().as_py() + if n_bad := nums.null_count: # pragma: nocover raise DataError(f"{n_bad} unknown entity IDs") val_array: pa.Array = pa.array(values) # type: ignore @@ -980,8 +978,7 @@ def add_vector_attribute( if e_tbl is None: # pragma: nocover raise DataError(f"no entities of class {cls}") nums = self._resolve_entity_ids(cls, entities, e_tbl) - 
if not np.all(nums.is_valid()): # pragma: nocover - n_bad = nums.is_valid().sum().as_py() + if n_bad := nums.null_count: # pragma: nocover raise DataError(f"{n_bad} unknown entity IDs") tbl_valid = np.zeros(e_tbl.num_rows, dtype=np.bool_) diff --git a/src/lenskit/data/msweb.py b/src/lenskit/data/msweb.py index 060634cc2..39ef6990b 100644 --- a/src/lenskit/data/msweb.py +++ b/src/lenskit/data/msweb.py @@ -5,102 +5,20 @@ # SPDX-License-Identifier: MIT """ -Support for the MSWeb datasets. -""" - -import csv -from pathlib import Path -from typing import Literal, overload - -import pyarrow as pa -from xopen import xopen - -from lenskit.logging import get_logger - -from ._builder import DatasetBuilder -from ._collection import ItemListCollection -from ._dataset import Dataset +Legacy location of the MS Web import functions. -_log = get_logger(__name__) +.. deprecated:: 2026.3 + Import from :mod:`lenskit.data` or :mod:`lenskit.data.sources.msweb` instead. +""" -@overload -def load_ms_web(path: Path, format: Literal["dataset"] = "dataset") -> Dataset: ... -@overload -def load_ms_web(path: Path, format: Literal["collection"]) -> ItemListCollection: ... -@overload -def load_ms_web( - path: Path, format: Literal["dataset", "collection"] = "dataset" -) -> Dataset | ItemListCollection: ... -def load_ms_web( - path: Path, format: Literal["dataset", "collection"] = "dataset" -) -> Dataset | ItemListCollection: - """ - Load the MSWeb data set. - - The Microsoft Anonymous Web data set was published by - :cite:t:`breeseEmpiricalAnalysisPredictive1998`, and is available from the - `UCI repository`_. - - This function can load the data either as a :class:`Dataset` (useful for - training) or as an :class:`ItemListCollection` (for evaluation). - - .. _UCI repository: https://kdd.ics.uci.edu/databases/msweb/msweb.html - - Args: - path: - The path to the data file (gzip-compressed). - format: - The type of object to load the data set into. - Returns: - The loaded MSWeb data. 
- """ - ds = _load_ms_dataset(path) - match format: - case "collection": - return ds.interactions("visit").item_lists() - case "dataset": - return ds - case _: # pragma: nocover - raise ValueError(f"invalid format: {format}") +import warnings +from .sources.msweb import load_ms_web -def _load_ms_dataset(path: Path) -> Dataset: - item_ids = [] - item_titles = [] - item_urls = [] - session_votes = [] - cur_session = None - _log.info("opening MSWeb file", path=str(path)) - with xopen(path, "rt") as data: - reader = csv.reader(data) - for row in reader: - code = row[0] - match code: - case "A": - _c, vid, _n, title, url = row - item_ids.append(int(vid)) - item_titles.append(title) - item_urls.append(url) - case "C": - _c, _sname, sid = row - cur_session = int(sid) - case "V": - _c, vid, _n = row - session_votes.append({"session_id": cur_session, "item_id": int(vid)}) +__all__ = ["load_ms_web"] - dsb = DatasetBuilder("ms-web") - dsb.add_entities("item", item_ids) - dsb.add_scalar_attribute("item", "title", item_ids, item_titles) - dsb.add_scalar_attribute("item", "url", item_ids, item_urls) - dsb.add_entity_class("session") - votes = pa.Table.from_pylist(session_votes) - dsb.add_interactions( - "visit", - votes, - entities=["session", "item"], - missing="insert", - default=True, - allow_repeats=False, - ) - return dsb.build() +warnings.warn( + "lenskit.data.msweb deprecated, use lenskit.data or lenskit.data.sources", + DeprecationWarning, +) diff --git a/src/lenskit/data/sources/msweb.py b/src/lenskit/data/sources/msweb.py new file mode 100644 index 000000000..e8408b921 --- /dev/null +++ b/src/lenskit/data/sources/msweb.py @@ -0,0 +1,106 @@ +# This file is part of LensKit. +# Copyright (C) 2018-2023 Boise State University. +# Copyright (C) 2023-2026 Drexel University. +# Licensed under the MIT license, see LICENSE.md for details. +# SPDX-License-Identifier: MIT + +""" +Support for the MSWeb datasets. 
+""" + +import csv +from pathlib import Path +from typing import Literal, overload + +import pyarrow as pa +from xopen import xopen + +from lenskit.logging import get_logger + +from .._builder import DatasetBuilder +from .._collection import ItemListCollection +from .._dataset import Dataset + +_log = get_logger(__name__) + + +@overload +def load_ms_web(path: Path, format: Literal["dataset"] = "dataset") -> Dataset: ... +@overload +def load_ms_web(path: Path, format: Literal["collection"]) -> ItemListCollection: ... +@overload +def load_ms_web( + path: Path, format: Literal["dataset", "collection"] = "dataset" +) -> Dataset | ItemListCollection: ... +def load_ms_web( + path: Path, format: Literal["dataset", "collection"] = "dataset" +) -> Dataset | ItemListCollection: + """ + Load the MSWeb data set. + + The Microsoft Anonymous Web data set was published by + :cite:t:`breeseEmpiricalAnalysisPredictive1998`, and is available from the + `UCI repository`_. + + This function can load the data either as a :class:`Dataset` (useful for + training) or as an :class:`ItemListCollection` (for evaluation). + + .. _UCI repository: https://kdd.ics.uci.edu/databases/msweb/msweb.html + + Args: + path: + The path to the data file (gzip-compressed). + format: + The type of object to load the data set into. + Returns: + The loaded MSWeb data. 
+ """ + ds = _load_ms_dataset(path) + match format: + case "collection": + return ds.interactions("visit").item_lists() + case "dataset": + return ds + case _: # pragma: nocover + raise ValueError(f"invalid format: {format}") + + +def _load_ms_dataset(path: Path) -> Dataset: + item_ids = [] + item_titles = [] + item_urls = [] + session_votes = [] + cur_session = None + _log.info("opening MSWeb file", path=str(path)) + with xopen(path, "rt") as data: + reader = csv.reader(data) + for row in reader: + code = row[0] + match code: + case "A": + _c, vid, _n, title, url = row + item_ids.append(int(vid)) + item_titles.append(title) + item_urls.append(url) + case "C": + _c, _sname, sid = row + cur_session = int(sid) + case "V": + _c, vid, _n = row + session_votes.append({"session_id": cur_session, "item_id": int(vid)}) + + dsb = DatasetBuilder("ms-web") + dsb.add_entities("item", item_ids) + dsb.add_scalar_attribute("item", "title", item_ids, item_titles) + dsb.add_scalar_attribute("item", "url", item_ids, item_urls) + dsb.add_entity_class("session") + votes = pa.Table.from_pylist(session_votes) + dsb.add_interactions( + "visit", + votes, + entities=["session", "item"], + missing="insert", + default=True, + allow_repeats=False, + ) + return dsb.build()
diff --git a/src/lenskit/data/sources/steam.py b/src/lenskit/data/sources/steam.py
new file mode 100644
index 000000000..aa034954b
--- /dev/null
+++ b/src/lenskit/data/sources/steam.py
@@ -0,0 +1,228 @@
+# This file is part of LensKit.
+# Copyright (C) 2018-2023 Boise State University.
+# Copyright (C) 2023-2026 Drexel University.
+# Licensed under the MIT license, see LICENSE.md for details.
+# SPDX-License-Identifier: MIT
+
+"""
+Support for importing `Steam data`_.
+
+.. _Steam data: https://cseweb.ucsd.edu/~jmcauley/datasets.html#steam_data
+"""
+
+import sys
+from collections.abc import Generator
+from itertools import batched
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+from xopen import xopen
+
+from lenskit._accel import data as _accel_data
+from lenskit.data import Dataset, DatasetBuilder
+from lenskit.diagnostics import DataError
+from lenskit.logging import get_logger
+
+_log = get_logger(__name__)
+
+BATCH_SIZE = 10_000
+AU_USERS_ITEMS_SCHEMA = pa.schema(
+    {
+        "user_id": pa.string(),
+        "items_count": pa.int32(),
+        "steam_id": pa.string(),
+        "user_url": pa.string(),
+        "items": pa.list_(
+            pa.struct(
+                [
+                    pa.field("item_id", pa.string()),
+                    pa.field("item_name", pa.string()),
+                    pa.field("playtime_2weeks", pa.float32()),
+                    pa.field("playtime_forever", pa.float32()),
+                ]
+            )
+        ),
+    }
+)
+
+
+def load_steam(*files: Path, reviews: bool = False) -> Dataset:
+    """
+    Load a `Steam dataset`_ from Julian McAuley's group at UCSD.
+
+    .. _Steam dataset: https://cseweb.ucsd.edu/~jmcauley/datasets.html#steam_data
+
+    .. note::
+
+        This function uses filenames to detect which version of the data to
+        load, so the files should be named exactly as they are from McAuley's
+        download site, optionally recompressed.
+
+    Args:
+        files:
+            Input files to read, in any order.  Can only load one version of
+            the data at a time (Australian or full).
+        reviews:
+            Set to ``True`` to include review text in the loaded dataset.
+
+    Returns:
+        The loaded dataset.
+
+    Raises:
+        DataError:
+            If no interaction file is supplied, or if files from the
+            Australian and full versions are mixed.
+    """
+    au_interactions = None
+    au_reviews = None
+    all_games = None
+    all_reviews = None
+
+    # autodetect our file(s)
+    for file in files:
+        name = file.name
+        if name.startswith("australian_user_reviews.json"):
+            au_reviews = file
+        elif name.startswith("australian_users_items.json"):
+            au_interactions = file
+        elif name.startswith("steam_games.json"):
+            all_games = file
+        elif name.startswith("steam_reviews.json"):
+            all_reviews = file
+        else:
+            # detection is purely name-based, so report inputs we cannot classify
+            _log.warning("ignoring unrecognized Steam file", file=str(file))
+
+    if au_interactions is not None:
+        _log.debug("looking for Australian subset interactions")
+        if all_reviews is not None or all_games is not None:
+            _log.error("cannot specify both Australian and overall input files")
+            raise DataError("invalid combination of Steam input files")
+
+        return _load_au_steam(au_interactions, au_reviews if reviews else None)
+
+    elif all_reviews is not None:
+        _log.debug("looking for full-data interactions")
+        if au_reviews is not None:
+            _log.error("cannot specify both Australian and overall input files")
+            raise DataError("invalid combination of Steam input files")
+
+        return _load_all_steam(all_games, all_reviews, include_reviews=reviews)
+    else:
+        _log.error("must supply one of australian_users_items or steam_reviews")
+        raise DataError("no Steam interactions provided")
+
+
+def _load_au_steam(interactions: Path, reviews: Path | None) -> Dataset:
+    """
+    Load the Australian-user subset from its ``australian_users_items`` file.
+
+    Only item and user entities are loaded so far; interactions and reviews
+    are not imported yet, so ``reviews`` is currently unused.
+    """
+    dsb = DatasetBuilder()
+
+    ui_data = _read_table(interactions, AU_USERS_ITEMS_SCHEMA)
+
+    _log.debug("de-duplicating Steam data", rows=ui_data.num_rows)
+    _uq_ids, uq_rows = np.unique(ui_data.column("steam_id"), return_index=True, sorted=False)
+    # keep surviving rows in file order; unsorted unique guarantees no particular order
+    uq_rows = pa.array(np.sort(uq_rows))
+    ui_data = ui_data.take(uq_rows)
+
+    _log.info("loaded Steam data", users=ui_data.num_rows)
+
+    items = ui_data.column("items")
+
+    _log.debug("loading items")
+    ii_flat_chunks = [c.flatten() for c in items.chunks]
+    ii_tbl: pa.Table = pa.table(
+        {
+            "item_id": pa.chunked_array(c.field("item_id") for c in ii_flat_chunks),
+            "item_name": pa.chunked_array(c.field("item_name") for c in ii_flat_chunks),
+        }
+    )
+    item_info: pa.Table = ii_tbl.group_by("item_id").aggregate([("item_name", "distinct")])
+    item_info = item_info.append_column(
+        "title", pc.list_element(item_info.column("item_name_distinct"), 0)
+    ).drop_columns("item_name_distinct")
+    item_info = item_info.filter(item_info.column("item_id").is_valid())
+    _log.debug("item schema: %s", item_info.schema)
+    _log.info("adding items", count=item_info.num_rows)
+    dsb.add_entities("item", item_info.column("item_id"))
+    dsb.add_scalar_attribute("item", "title", item_info)
+
+    _log.info("adding users", count=ui_data.num_rows)
+    users = ui_data.select(["steam_id", "user_id"]).rename_columns(["user_id", "username"])
+    dsb.add_entities("user", users)
+
+    _log.info("adding user-item interactions")
+    # TODO: make DSB work better with CSR-shaped data
+
+    return dsb.build()
+
+
+def _load_all_steam(games: Path | None, reviews: Path, *, include_reviews: bool) -> Dataset:
+    raise NotImplementedError("loading the full Steam data is not yet supported")
+
+
+def _read_table(path: Path, schema: pa.Schema | None = None) -> pa.Table:
+    """
+    Read a file of loose-JSON records into an Arrow table.
+    """
+    _log.debug("reading table from loose JSON", file=str(path))
+    # passing the schema through lets an empty file yield an empty table
+    tbl = pa.Table.from_batches(_decode_chunks(path, schema), schema=schema)
+    _log.debug("finished reading table", rows=tbl.num_rows, file=str(path))
+    return tbl
+
+
+def _decode_chunks(path: Path, schema: pa.Schema | None = None) -> Generator[pa.RecordBatch]:
+    """
+    Decode a file into record batches of up to :data:`BATCH_SIZE` rows.
+
+    When no schema is given, the schema inferred for the first batch is
+    applied to every later batch.
+    """
+    for chunk in batched(_decode_steam(path), BATCH_SIZE):
+        batch = pa.RecordBatch.from_pylist(chunk, schema)
+        if schema is None:
+            schema = batch.schema
+        yield batch
+
+
+def _decode_steam(path: Path) -> Generator[dict[str, Any]]:
+    """
+    Decode a stream of malformed JSON, one record per line.
+    """
+    with xopen(path, "rt") as stream:
+        for lineno, line in enumerate(stream, start=1):
+            if not line.strip():
+                # tolerate blank lines (e.g. a trailing newline at end of file)
+                continue
+            try:
+                record = _accel_data.pyon_loads(line)
+            except ValueError:
+                # the parser only knows the position within the line
+                _log.error("invalid Steam record", file=str(path), line=lineno)
+                raise
+            yield record
+
+
+def _preview_file(path: str | Path):
+    """
+    Print the schema inferred from the first batch of a file.
+    """
+    path = Path(path)
+    batch = next(_decode_chunks(path), None)
+    if batch is None:
+        print("no records found")
+    else:
+        print(batch.schema)
+
+
+if __name__ == "__main__":
+    _preview_file(sys.argv[1])
diff --git a/src/lenskit/testing/_msweb.py b/src/lenskit/testing/_msweb.py index ea2f078df..9737676e2 100644 --- a/src/lenskit/testing/_msweb.py +++ b/src/lenskit/testing/_msweb.py @@ -8,7 +8,7 @@ from pytest import fixture, skip -from lenskit.data.msweb import load_ms_web +from lenskit.data.sources.msweb import load_ms_web from lenskit.splitting import TTSplit MSWEB_TRAIN = Path("data/anonymous-msweb.data.gz") diff --git a/tests/cli/count-tests.pl b/tests/cli/count-tests.pl index b40030bc9..b040da6c2 100755 --- a/tests/cli/count-tests.pl +++ b/tests/cli/count-tests.pl @@ -8,7 +8,7 @@ print STDERR "counting tests\n" if $verbose; while (<>) { - if (m/^(run-(lenskit|python|command)|require)\b/) { + if (m/^\s*(run-(lenskit|python|command)|require)\b/) { print STDERR "test: $_" if $verbose; $test_count += 1; } diff --git a/tests/cli/test-data-convert.sh b/tests/cli/test-data-convert.sh index 08d4281d0..2538e2a54 100644 --- a/tests/cli/test-data-convert.sh +++ b/tests/cli/test-data-convert.sh @@ -4,3 +4,11 @@ require -f "$TEST_WORK/ml-data/schema.json" name="$(jq -r .name "$TEST_WORK/ml-data/schema.json")" require "$name" = ml-latest-small + +if [[ -f data/australian_users_items.json.gz ]]; then + run-lenskit data convert --steam data/australian_users_items.json.gz "$TEST_WORK/steam-au-data" + require -d "$TEST_WORK/steam-au-data" + require -f "$TEST_WORK/steam-au-data/schema.json" +else + skip 3 +fi diff --git a/uv.lock b/uv.lock index 5cfdaee3a..b32775cb3 100644 --- a/uv.lock +++ b/uv.lock @@ -3248,45 +3248,45 @@ wheels = [ [[package]] name = "pyarrow" -version = "23.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url =
"https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, - { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, - { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, - { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, - { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = "2026-02-16T10:10:25.363Z" }, - { url = 
"https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, - { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, - { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" }, - { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" }, - { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" }, - { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" }, - { url = 
"https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" }, - { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" }, - { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" }, - { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" }, - { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" }, - { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" }, - { url = 
"https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" }, - { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" }, - { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" }, - { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" }, - { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" }, - { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" }, - { url = 
"https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" }, - { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" }, - { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" }, - { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" }, - { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" }, - { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" }, - { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" }, - { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" }, - { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" }, - { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" }, - { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" }, +version = "24.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" }, + { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" }, + { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" }, + { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" }, + { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759, upload-time = "2026-04-21T10:48:07.258Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471, upload-time = "2026-04-21T10:48:13.347Z" }, + { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981, upload-time = "2026-04-21T10:48:20.201Z" }, + { url = "https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172, upload-time = "2026-04-21T10:48:27.541Z" }, + { url = 
"https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733, upload-time = "2026-04-21T10:48:34.7Z" }, + { url = "https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335, upload-time = "2026-04-21T10:48:42.099Z" }, + { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748, upload-time = "2026-04-21T10:49:42.532Z" }, + { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554, upload-time = "2026-04-21T10:48:48.526Z" }, + { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301, upload-time = "2026-04-21T10:48:55.181Z" }, + { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929, upload-time = "2026-04-21T10:49:03.676Z" }, + { url = 
"https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365, upload-time = "2026-04-21T10:49:11.714Z" }, + { url = "https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819, upload-time = "2026-04-21T10:49:21.474Z" }, + { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252, upload-time = "2026-04-21T10:49:31.164Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127, upload-time = "2026-04-21T10:49:37.334Z" }, + { url = "https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997, upload-time = "2026-04-21T10:49:48.796Z" }, + { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720, upload-time = "2026-04-21T10:49:55.858Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852, upload-time = "2026-04-21T10:50:04.624Z" }, + { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852, upload-time = "2026-04-21T10:50:12.293Z" }, + { url = "https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207, upload-time = "2026-04-21T10:50:20.677Z" }, + { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117, upload-time = "2026-04-21T10:50:29.14Z" }, + { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155, upload-time = "2026-04-21T10:51:22.337Z" }, + { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387, upload-time = "2026-04-21T10:50:35.552Z" }, + { url = 
"https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102, upload-time = "2026-04-21T10:50:42.417Z" }, + { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118, upload-time = "2026-04-21T10:50:49.324Z" }, + { url = "https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765, upload-time = "2026-04-21T10:50:55.579Z" }, + { url = "https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890, upload-time = "2026-04-21T10:51:02.439Z" }, + { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250, upload-time = "2026-04-21T10:51:10.576Z" }, + { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282, upload-time = "2026-04-21T10:51:16.815Z" }, ] [[package]]