
Embedders

bocoel.Embedder

Bases: Protocol

Embedders are responsible for encoding text into vectors. Embedders in this project are considered volatile because encoding requires CPU time, unless a database with built-in encoder capability is used.

batch abstractmethod property

batch: int

The batch size to use when encoding.

dims abstractmethod property

dims: int

The dimensions of the embeddings.

encode_storage

encode_storage(
    storage: Storage,
    /,
    transform: Callable[[Mapping[str, Sequence[Any]]], Sequence[str]],
) -> NDArray

Encodes the storage into embeddings.

Parameters:

    storage (Storage, required): The storage to encode.
    transform (Callable[[Mapping[str, Sequence[Any]]], Sequence[str]], required): The transformation function to use.

Returns:

    NDArray: The encoded embeddings. The shape must be [len(storage), self.dims].

Source code in src/bocoel/corpora/embedders/interfaces.py
def encode_storage(
    self,
    storage: Storage,
    /,
    transform: Callable[[Mapping[str, Sequence[Any]]], Sequence[str]],
) -> NDArray:
    """
    Encodes the storage into embeddings.

    Parameters:
        storage: The storage to encode.
        transform: The transformation function to use.

    Returns:
        The encoded embeddings. The shape must be `[len(storage), self.dims]`.
    """

    results: list[NDArray] = []

    for idx in tqdm(range(0, len(storage), self.batch)):
        LOGGER.debug(
            "Encoding storage",
            storage=storage,
            batch_size=self.batch,
            idx=idx,
            total=len(storage),
        )
        batch = storage[idx : idx + self.batch]
        texts = transform(batch)
        encoded = self.encode(texts)
        results.append(encoded)

    return np.concatenate(results, axis=0)
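
For illustration, a transform simply extracts one column from each storage batch and turns its values into strings. A minimal sketch, assuming a hypothetical storage with a "question" column and an embedder instance already constructed:

from collections.abc import Mapping, Sequence
from typing import Any


def to_texts(batch: Mapping[str, Sequence[Any]]) -> Sequence[str]:
    # Each batch maps column names to equal-length sequences of values.
    return [str(value) for value in batch["question"]]


embeddings = embedder.encode_storage(storage, transform=to_texts)
assert embeddings.shape == (len(storage), embedder.dims)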

encode

encode(text: Sequence[str]) -> NDArray

Calls the underlying encode function and performs dimension checks. The text may be encoded in batches.

Parameters:

    text (Sequence[str], required): The text to encode.

Returns:

    NDArray: The encoded embeddings. The shape must be [len(text), self.dims].

Source code in src/bocoel/corpora/embedders/interfaces.py
def encode(self, text: Sequence[str], /) -> NDArray:
    """
    Calls the encode function and performs some checks.
    Would try to encode the text in batches.

    Parameters:
        text: The text to encode.

    Returns:
        The encoded embeddings. The shape must be `[len(text), self.dims]`.
    """

    with torch.no_grad():
        encoded = self._encode(text)

    if (dim := encoded.shape[-1]) != self.dims:
        raise ValueError(
            f"Expected the encoded embeddings to have dimension {self.dims}, got {dim}"
        )

    return encoded.cpu().numpy()

_encode abstractmethod

_encode(texts: Sequence[str]) -> Tensor

The actual encode function.

Parameters:

    texts (Sequence[str], required): The texts to encode.

Returns:

    Tensor: The encoded embeddings. The shape must be [len(texts), self.dims].

Source code in src/bocoel/corpora/embedders/interfaces.py
@abc.abstractmethod
def _encode(self, texts: Sequence[str], /) -> Tensor:
    """
    The actual encode function.

    Parameters:
        texts: The texts to encode.

    Returns:
        The encoded embeddings. The shape must be `[len(texts), self.dims]`.
    """

    ...
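
To implement the protocol, a subclass only needs to provide batch, dims, and _encode; encode and encode_storage are inherited. A minimal sketch with a toy, deterministic _encode (the character-based features are purely illustrative and not part of bocoel):

from collections.abc import Sequence

import torch
from torch import Tensor

from bocoel import Embedder


class ToyEmbedder(Embedder):
    @property
    def batch(self) -> int:
        return 32

    @property
    def dims(self) -> int:
        return 4

    def _encode(self, texts: Sequence[str], /) -> Tensor:
        # One row per text, with four hand-crafted features per row.
        rows = [
            [float(len(t)), float(t.count(" ")), float(sum(map(ord, t))), 1.0]
            for t in texts
        ]
        return torch.tensor(rows)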

bocoel.SbertEmbedder

SbertEmbedder(
    model_name: str = "all-mpnet-base-v2",
    device: str = "cpu",
    batch_size: int = 64,
)

Bases: Embedder

Sentence-BERT embedder. Uses the sentence_transformers library.

Initializes the Sbert embedder.

Parameters:

    model_name (str, default 'all-mpnet-base-v2'): The model name to use.
    device (str, default 'cpu'): The device to use.
    batch_size (int, default 64): The batch size for encoding.

Raises:

    ImportError: If sentence_transformers is not installed.

Source code in src/bocoel/corpora/embedders/sberts.py
def __init__(
    self,
    model_name: str = "all-mpnet-base-v2",
    device: str = "cpu",
    batch_size: int = 64,
) -> None:
    """
    Initializes the Sbert embedder.

    Parameters:
        model_name: The model name to use.
        device: The device to use.
        batch_size: The batch size for encoding.

    Raises:
        ImportError: If sentence_transformers is not installed.
    """

    # Optional dependency.
    from sentence_transformers import SentenceTransformer

    self._name = model_name
    self._sbert = SentenceTransformer(model_name, device=device)

    self._batch_size = batch_size
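
A minimal usage sketch with the documented defaults (the input sentences are arbitrary examples):

from bocoel import SbertEmbedder

embedder = SbertEmbedder(model_name="all-mpnet-base-v2", device="cpu", batch_size=64)
vectors = embedder.encode(["hello world", "goodbye world"])
# One row per input sentence, each of length `dims`.
assert vectors.shape == (2, embedder.dims)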

SbertEmbedder inherits encode_storage and encode from bocoel.Embedder; see their documentation above.

bocoel.HuggingfaceEmbedder

HuggingfaceEmbedder(
    path: str,
    device: str = "cpu",
    batch_size: int = 64,
    transform: Callable[[Any], Tensor] = lambda output: output.logits,
)

Bases: Embedder

Huggingface embedder. Uses the transformers library. It is not a traditional encoder: it runs a sequence classifier and uses the logits as embeddings.

Initializes the Huggingface embedder.

Parameters:

    path (str, required): The path to the model.
    device (str, default 'cpu'): The device to use.
    batch_size (int, default 64): The batch size for encoding.
    transform (Callable[[Any], Tensor], default lambda output: output.logits): The transformation function to use.

Raises:

    ImportError: If transformers is not installed.
    ValueError: If the model does not have a config.id2label attribute.

Source code in src/bocoel/corpora/embedders/huggingface.py
def __init__(
    self,
    path: str,
    device: str = "cpu",
    batch_size: int = 64,
    transform: Callable[[Any], Tensor] = lambda output: output.logits,
) -> None:
    """
    Initializes the Huggingface embedder.

    Parameters:
        path: The path to the model.
        device: The device to use.
        batch_size: The batch size for encoding.
        transform: The transformation function to use.

    Raises:
        ImportError: If transformers is not installed.
        ValueError: If the model does not have a `config.id2label` attribute.
    """

    # Optional dependency.
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    self._path = path
    self._model = AutoModelForSequenceClassification.from_pretrained(path)
    self._tokenizer = AutoTokenizer.from_pretrained(path)
    self._batch_size = batch_size

    self._device = device
    self._model = self._model.to(device)
    self._transform = transform

    try:
        self._dims = len(self._model.config.id2label)
    except AttributeError as e:
        raise ValueError(
            "The model must have a `config.id2label` attribute to determine the number of classes."
        ) from e
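
A minimal usage sketch; the checkpoint name is only an example of a sequence-classification model that exposes config.id2label:

from bocoel import HuggingfaceEmbedder

embedder = HuggingfaceEmbedder(
    path="distilbert-base-uncased-finetuned-sst-2-english",  # example checkpoint
    device="cpu",
)
vectors = embedder.encode(["a great movie", "a terrible movie"])
# dims equals the number of class labels of the underlying classifier.
assert vectors.shape == (2, embedder.dims)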

HuggingfaceEmbedder inherits encode_storage and encode from bocoel.Embedder; see their documentation above.

bocoel.EnsembleEmbedder

EnsembleEmbedder(embedders: Sequence[Embedder], sequential: bool = False)

Bases: Embedder

An ensemble of embedders. The embeddings from the member embedders are concatenated together.

Parameters:

    embedders (Sequence[Embedder], required): The embedders to use.
    sequential (bool, default False): Whether to use sequential processing.

Raises:

    ValueError: If the embedders have different batch sizes.

Source code in src/bocoel/corpora/embedders/ensemble.py
def __init__(self, embedders: Sequence[Embedder], sequential: bool = False) -> None:
    """
    Parameters:
        embedders: The embedders to use.
        sequential: Whether to use sequential processing.

    Raises:
        ValueError: If the embedders have different batch sizes.
    """

    # Check if all embedders have the same batch size.
    self._embedders = embedders
    self._batch_size = embedders[0].batch
    if len(set(emb.batch for emb in embedders)) != 1:
        raise ValueError("All embedders must have the same batch size")

    self._sequential = sequential

    cpus = os.cpu_count()
    assert cpus is not None
    self._cpus = cpus
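
A minimal sketch combining two Sbert embedders; the model names are examples, and both members keep the default batch size so the batch-size check passes:

from bocoel import EnsembleEmbedder, SbertEmbedder

first = SbertEmbedder(model_name="all-mpnet-base-v2")
second = SbertEmbedder(model_name="all-MiniLM-L6-v2")
ensemble = EnsembleEmbedder([first, second])

vectors = ensemble.encode(["hello world"])
# Concatenation means the ensemble dimensionality is the sum of the members'.
assert vectors.shape == (1, first.dims + second.dims)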

EnsembleEmbedder inherits encode_storage and encode from bocoel.Embedder; see their documentation above.