
Storages

bocoel.Storage

Bases: Protocol

Storage is responsible for storing the data. This can be thought of as a table.

__len__ abstractmethod

__len__() -> int

Returns the number of rows in the storage.

Source code in src/bocoel/corpora/storages/interfaces.py
@abc.abstractmethod
def __len__(self) -> int:
    """
    Returns the number of rows in the storage.
    """

    ...

_getitem abstractmethod

_getitem(idx: int) -> Mapping[str, Any]

Returns the row at the given index.

Source code in src/bocoel/corpora/storages/interfaces.py
@abc.abstractmethod
def _getitem(self, idx: int) -> Mapping[str, Any]:
    """
    Returns the row at the given index.
    """

    ...
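
Together, these two methods cover the row-access part of the protocol. Below is a minimal in-memory sketch of a conforming class; the class name and list backing are illustrative, not part of bocoel, and the full protocol may require additional members (such as keys()) not shown in this section.

from collections.abc import Mapping
from typing import Any


class ListStorage:
    """Hypothetical storage backed by a list of dicts, one dict per row."""

    def __init__(self, rows: list[dict[str, Any]]) -> None:
        self._rows = rows

    def __len__(self) -> int:
        # Number of rows in the table.
        return len(self._rows)

    def _getitem(self, idx: int) -> Mapping[str, Any]:
        # The row at the given index, as a column-name -> value mapping.
        return self._rows[idx]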

bocoel.PandasStorage

PandasStorage(df: DataFrame)

Bases: Storage

Storage backed by a pandas DataFrame. Since pandas DataFrames are held in memory, this storage is fast, but can be memory inefficient and require a lot of RAM.

Source code in src/bocoel/corpora/storages/pandas.py
def __init__(self, df: DataFrame, /) -> None:
    self._df = df
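
A quick usage sketch; the column names are made up for illustration:

from pandas import DataFrame

from bocoel import PandasStorage

# A tiny in-memory table with two columns.
df = DataFrame({"question": ["1 + 1 = ?", "2 + 2 = ?"], "answer": ["2", "4"]})
storage = PandasStorage(df)
assert len(storage) == 2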

from_jsonl_file classmethod

from_jsonl_file(path: str | Path) -> Self

Load data from a JSONL file.

Parameters:

    path (str | Path, required): The path to the file.

Returns:

    Self: A PandasStorage instance.

Source code in src/bocoel/corpora/storages/pandas.py
@classmethod
def from_jsonl_file(cls, path: str | Path, /) -> Self:
    """
    Load data from a JSONL file.

    Parameters:
        path: The path to the file.

    Returns:
        A `PandasStorage` instance.
    """

    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(path)

    if not path.is_file():
        raise ValueError(f"Cannot open file: {path}")

    with open(path) as f:
        lines = map(lambda s: s.strip("\n"), f.readlines())

    data = [json.loads(line) for line in lines]
    return cls.from_jsonl(data)
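
For example, given a hypothetical data.jsonl with one JSON object per line:

# data.jsonl:
#   {"question": "1 + 1 = ?", "answer": "2"}
#   {"question": "2 + 2 = ?", "answer": "4"}
from bocoel import PandasStorage

storage = PandasStorage.from_jsonl_file("data.jsonl")
assert len(storage) == 2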

from_jsonl classmethod

from_jsonl(data: Sequence[Mapping[str, str]]) -> Self

Load data from parsed JSONL, i.e. a sequence of JSON mappings.

Parameters:

    data (Sequence[Mapping[str, str]], required): The parsed JSONL data: a sequence of JSON mappings.

Returns:

    Self: A PandasStorage instance.

Source code in src/bocoel/corpora/storages/pandas.py
@classmethod
def from_jsonl(cls, data: Sequence[Mapping[str, str]], /) -> Self:
    """
    Load data from a JSONL object or a list of JSON.

    Parameters:
        data: The JSONL object or list of JSON.

    Returns:
        A `PandasStorage` instance.
    """

    df = DataFrame.from_records(data)
    return cls(df)
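
Equivalently, already-parsed records can be passed directly:

from bocoel import PandasStorage

records = [
    {"question": "1 + 1 = ?", "answer": "2"},
    {"question": "2 + 2 = ?", "answer": "4"},
]
storage = PandasStorage.from_jsonl(records)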

bocoel.DatasetsStorage

DatasetsStorage(path: str, name: str | None = None, split: str | None = None)

Bases: Storage

Storage for datasets from the HuggingFace Datasets library. Datasets are stored on disk, so they can be slower to access, but are more memory efficient.

Source code in src/bocoel/corpora/storages/datasets.py
def __init__(
    self, path: str, name: str | None = None, split: str | None = None
) -> None:
    # Optional dependency.
    import datasets
    from datasets import DatasetDict

    self._path = path
    self._name = name
    self._split = split

    ds = datasets.load_dataset(path=path, name=name, trust_remote_code=True)

    if split:
        if not isinstance(ds, DatasetDict):
            raise ValueError("Split is not supported for this dataset")

        ds = ds[split]

    self._dataset = ds
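
A usage sketch; the dataset id is illustrative, and loading requires the optional datasets dependency:

from bocoel import DatasetsStorage

# Downloads (or reads from the local cache) the "squad" dataset
# and selects its train split.
storage = DatasetsStorage("squad", split="train")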

bocoel.ConcatStorage

ConcatStorage(storages: Sequence[Storage])

Bases: Storage

Storage that concatenates multiple storages together. Concatenation is done on the first dimension. The resulting storage is read-only and has length equal to the sum of the lengths of the storages.

Source code in src/bocoel/corpora/storages/concat.py
def __init__(self, storages: Sequence[Storage], /) -> None:
    if len(storages) < 1:
        raise ValueError("At least one storage is required")

    diff_keys = set(frozenset(store.keys()) for store in storages)
    if len(diff_keys) > 1:
        raise ValueError("Keys are not equal")

    # Unpack the only key in `diff_keys`.
    (self._keys,) = diff_keys
    self._storages = tuple(storages)

    LOGGER.info("Concat storage created", storages=storages, keys=diff_keys)

    storage_lengths = [len(store) for store in self._storages]
    self._prefix_sum = np.cumsum(storage_lengths).tolist()
    self._length = sum(storage_lengths)
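
The prefix sum makes it cheap to map a global row index to a (storage, local index) pair via binary search. A sketch of that arithmetic (illustrative, not necessarily bocoel's exact implementation):

import bisect

def locate(prefix_sum: list[int], idx: int) -> tuple[int, int]:
    # Index of the first storage whose cumulative length exceeds idx.
    which = bisect.bisect_right(prefix_sum, idx)
    # Offset within that storage: subtract the lengths of all earlier storages.
    start = prefix_sum[which - 1] if which > 0 else 0
    return which, idx - start

# With storages of lengths [3, 5], prefix_sum == [3, 8]:
assert locate([3, 8], 0) == (0, 0)
assert locate([3, 8], 4) == (1, 1)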