
archaeo_super_prompt.modeling.legacy_predict


"""Legacy model for comparison."""

from sklearn.pipeline import Pipeline

from .pdf_to_text import VLLM_Preprocessing
from .struct_extract.language_model import get_vllm_model
from .struct_extract.legacy_extractor.main_transformer import MagohDataExtractor


def get_legacy_model():
    """Return the legacy model, with the vLLM OCR stage as a pre-processing layer."""
    llm_model = get_vllm_model(temperature=0.05)
    pipeline = Pipeline(
        [
            (
                "vllm",
                # OCR the scanned PDF chunks with a small vision-language model
                # before handing the text to the legacy extractor.
                VLLM_Preprocessing(
                    vlm_provider="vllm",
                    vlm_model_id="ibm-granite/granite-vision-3.3-2b",
                    prompt="OCR this part of an Italian document for markdown-based processing.",
                    embedding_model_hf_id="nomic-ai/nomic-embed-text-v1.5",
                    incipit_only=True,
                ),
            ),
            ("extractor", MagohDataExtractor(llm_model)),
        ],
    )
    # Configure the pipeline itself to emit pandas DataFrames from its
    # transform steps; a config_context around construction alone would
    # not persist after the function returns.
    pipeline.set_output(transform="pandas")
    return pipeline
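A minimal usage sketch, assuming the returned pipeline follows the usual scikit-learn fit/transform contract; here `chunks` is a hypothetical pandas DataFrame of PDF chunks (conforming to PDFChunkDatasetSchema) that a caller would construct elsewhere:

    model = get_legacy_model()
    # Runs OCR pre-processing, then the legacy structured extraction, in one pass.
    extracted = model.fit_transform(chunks)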