
Embedders

bocoel.Embedder

Bases: Protocol

Embedders are responsible for encoding text into vectors. Embedders in this project are considered volatile because encoding requires CPU time, unless a database with built-in encoder capability is used.

batch abstractmethod property

batch: int

The batch size to use when encoding.

dims abstractmethod property

dims: int

The dimensions of the embeddings.

encode_storage

encode_storage(
    storage: Storage,
    /,
    transform: Callable[[Mapping[str, Sequence[Any]]], Sequence[str]],
) -> NDArray

Encodes the storage into embeddings.

Parameters:

    storage (Storage, required): The storage to encode.
    transform (Callable[[Mapping[str, Sequence[Any]]], Sequence[str]], required): The transformation function to use.

Returns:

    NDArray: The encoded embeddings. The shape must be [len(storage), self.dims].

Source code in src/bocoel/corpora/embedders/interfaces.py
def encode_storage(
    self,
    storage: Storage,
    /,
    transform: Callable[[Mapping[str, Sequence[Any]]], Sequence[str]],
) -> NDArray:
    """
    Encodes the storage into embeddings.

    Parameters:
        storage: The storage to encode.
        transform: The transformation function to use.

    Returns:
        The encoded embeddings. The shape must be `[len(storage), self.dims]`.
    """

    results: list[NDArray] = []

    for idx in tqdm(range(0, len(storage), self.batch)):
        LOGGER.debug(
            "Encoding storage",
            storage=storage,
            batch_size=self.batch,
            idx=idx,
            total=len(storage),
        )
        batch = storage[idx : idx + self.batch]
        texts = transform(batch)
        encoded = self.encode(texts)
        results.append(encoded)

    return np.concatenate(results, axis=0)
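
For illustration, a transform simply extracts one column from each storage batch and turns its values into strings. A minimal sketch, assuming a hypothetical storage with a "question" column and an embedder instance already constructed:

from collections.abc import Mapping, Sequence
from typing import Any


def to_texts(batch: Mapping[str, Sequence[Any]]) -> Sequence[str]:
    # Each batch maps column names to equal-length sequences of values.
    return [str(value) for value in batch["question"]]


embeddings = embedder.encode_storage(storage, transform=to_texts)
assert embeddings.shape == (len(storage), embedder.dims)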

encode

encode(text: Sequence[str]) -> NDArray

Calls the underlying encode function and performs dimension checks. The text may be encoded in batches.

Parameters:

    text (Sequence[str], required): The text to encode.

Returns:

    NDArray: The encoded embeddings. The shape must be [len(text), self.dims].

Source code in src/bocoel/corpora/embedders/interfaces.py
def encode(self, text: Sequence[str], /) -> NDArray:
    """
    Calls the encode function and performs some checks.
    Would try to encode the text in batches.

    Parameters:
        text: The text to encode.

    Returns:
        The encoded embeddings. The shape must be `[len(text), self.dims]`.
    """

    with torch.no_grad():
        encoded = self._encode(text)

    if (dim := encoded.shape[-1]) != self.dims:
        raise ValueError(
            f"Expected the encoded embeddings to have dimension {self.dims}, got {dim}"
        )

    return encoded.cpu().numpy()

_encode abstractmethod

_encode(texts: Sequence[str]) -> Tensor

The actual encode function.

Parameters:

    texts (Sequence[str], required): The texts to encode.

Returns:

    Tensor: The encoded embeddings. The shape must be [len(texts), self.dims].

Source code in src/bocoel/corpora/embedders/interfaces.py
@abc.abstractmethod
def _encode(self, texts: Sequence[str], /) -> Tensor:
    """
    The actual encode function.

    Parameters:
        texts: The texts to encode.

    Returns:
        The encoded embeddings. The shape must be `[len(texts), self.dims]`.
    """

    ...
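
To implement the protocol, a subclass only needs to provide batch, dims, and _encode; encode and encode_storage are inherited. A minimal sketch with a toy, deterministic _encode (the character-based features are purely illustrative and not part of bocoel):

from collections.abc import Sequence

import torch
from torch import Tensor

from bocoel import Embedder


class ToyEmbedder(Embedder):
    @property
    def batch(self) -> int:
        return 32

    @property
    def dims(self) -> int:
        return 4

    def _encode(self, texts: Sequence[str], /) -> Tensor:
        # One row per text, with four hand-crafted features per row.
        rows = [
            [float(len(t)), float(t.count(" ")), float(sum(map(ord, t))), 1.0]
            for t in texts
        ]
        return torch.tensor(rows)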

bocoel.SbertEmbedder

SbertEmbedder(
    model_name: str = "all-mpnet-base-v2",
    device: str = "cpu",
    batch_size: int = 64,
)

Bases: Embedder

Sentence-BERT embedder. Uses the sentence_transformers library.

Initializes the Sbert embedder.

Parameters:

    model_name (str, default 'all-mpnet-base-v2'): The model name to use.
    device (str, default 'cpu'): The device to use.
    batch_size (int, default 64): The batch size for encoding.

Raises:

    ImportError: If sentence_transformers is not installed.

Source code in src/bocoel/corpora/embedders/sberts.py
def __init__(
    self,
    model_name: str = "all-mpnet-base-v2",
    device: str = "cpu",
    batch_size: int = 64,
) -> None:
    """
    Initializes the Sbert embedder.

    Parameters:
        model_name: The model name to use.
        device: The device to use.
        batch_size: The batch size for encoding.

    Raises:
        ImportError: If sentence_transformers is not installed.
    """

    # Optional dependency.
    from sentence_transformers import SentenceTransformer

    self._name = model_name
    self._sbert = SentenceTransformer(model_name, device=device)

    self._batch_size = batch_size
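
A minimal usage sketch with the documented defaults (the input sentences are arbitrary examples):

from bocoel import SbertEmbedder

embedder = SbertEmbedder(model_name="all-mpnet-base-v2", device="cpu", batch_size=64)
vectors = embedder.encode(["hello world", "goodbye world"])
# One row per input sentence, each of length `dims`.
assert vectors.shape == (2, embedder.dims)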

SbertEmbedder inherits encode_storage and encode from bocoel.Embedder; see their documentation above.

bocoel.HuggingfaceEmbedder

HuggingfaceEmbedder(
    path: str,
    device: str = "cpu",
    batch_size: int = 64,
    transform: Callable[[Any], Tensor] = lambda output: output.logits,
)

Bases: Embedder

Huggingface embedder. Uses the transformers library. It is not a traditional encoder: it runs a sequence classifier and uses the logits as embeddings.

Initializes the Huggingface embedder.

Parameters:

    path (str, required): The path to the model.
    device (str, default 'cpu'): The device to use.
    batch_size (int, default 64): The batch size for encoding.
    transform (Callable[[Any], Tensor], default lambda output: output.logits): The transformation function to use.

Raises:

    ImportError: If transformers is not installed.
    ValueError: If the model does not have a config.id2label attribute.

Source code in src/bocoel/corpora/embedders/huggingface.py
def __init__(
    self,
    path: str,
    device: str = "cpu",
    batch_size: int = 64,
    transform: Callable[[Any], Tensor] = lambda output: output.logits,
) -> None:
    """
    Initializes the Huggingface embedder.

    Parameters:
        path: The path to the model.
        device: The device to use.
        batch_size: The batch size for encoding.
        transform: The transformation function to use.

    Raises:
        ImportError: If transformers is not installed.
        ValueError: If the model does not have a `config.id2label` attribute.
    """

    # Optional dependency.
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    self._path = path
    self._model = AutoModelForSequenceClassification.from_pretrained(path)
    self._tokenizer = AutoTokenizer.from_pretrained(path)
    self._batch_size = batch_size

    self._device = device
    self._model = self._model.to(device)
    self._transform = transform

    try:
        self._dims = len(self._model.config.id2label)
    except AttributeError as e:
        raise ValueError(
            "The model must have a `config.id2label` attribute to determine the number of classes."
        ) from e
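
A minimal usage sketch; the checkpoint name is only an example of a sequence-classification model that exposes config.id2label:

from bocoel import HuggingfaceEmbedder

embedder = HuggingfaceEmbedder(
    path="distilbert-base-uncased-finetuned-sst-2-english",  # example checkpoint
    device="cpu",
)
vectors = embedder.encode(["a great movie", "a terrible movie"])
# dims equals the number of class labels of the underlying classifier.
assert vectors.shape == (2, embedder.dims)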

HuggingfaceEmbedder inherits encode_storage and encode from bocoel.Embedder; see their documentation above.

bocoel.EnsembleEmbedder

EnsembleEmbedder(embedders: Sequence[Embedder], sequential: bool = False)

Bases: Embedder

An ensemble of embedders. The embeddings from the member embedders are concatenated together.

Parameters:

    embedders (Sequence[Embedder], required): The embedders to use.
    sequential (bool, default False): Whether to use sequential processing.

Raises:

    ValueError: If the embedders have different batch sizes.

Source code in src/bocoel/corpora/embedders/ensemble.py
def __init__(self, embedders: Sequence[Embedder], sequential: bool = False) -> None:
    """
    Parameters:
        embedders: The embedders to use.
        sequential: Whether to use sequential processing.

    Raises:
        ValueError: If the embedders have different batch sizes.
    """

    # Check if all embedders have the same batch size.
    self._embedders = embedders
    self._batch_size = embedders[0].batch
    if len(set(emb.batch for emb in embedders)) != 1:
        raise ValueError("All embedders must have the same batch size")

    self._sequential = sequential

    cpus = os.cpu_count()
    assert cpus is not None
    self._cpus = cpus
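
A minimal sketch combining two Sbert embedders; the model names are examples, and both members keep the default batch size so the batch-size check passes:

from bocoel import EnsembleEmbedder, SbertEmbedder

first = SbertEmbedder(model_name="all-mpnet-base-v2")
second = SbertEmbedder(model_name="all-MiniLM-L6-v2")
ensemble = EnsembleEmbedder([first, second])

vectors = ensemble.encode(["hello world"])
# Concatenation means the ensemble dimensionality is the sum of the members'.
assert vectors.shape == (1, first.dims + second.dims)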

EnsembleEmbedder inherits encode_storage and encode from bocoel.Embedder; see their documentation above.