
archaeo_super_prompt.modeling.pdf_to_text

"""PDF Ingestion layer with vision llm and chunking model."""

from pathlib import Path
from typing import Literal, override

from tqdm import tqdm

from ...types.pdfchunks import PDFChunkDataset
from ...types.pdfpaths import (
    PDFPathDataset,
)
from ..types.base_transformer import BaseTransformer
from . import chunking as vllm_doc_chunk_mod
from . import stream_ocr_manual as vllm_scan_mod


class VLLM_Preprocessing(BaseTransformer):
    """First PDF ingestion layer for the pipeline. Include vision-llm scan and text chunking.

    This pipeline FunctionTransformer directly takes in input a batch of paths
    of PDF files to be ingested. It read the text with a vision-llm and output
    text chunks with being aware to the layout and a tokenization method to be
    provided.
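
    Example:
        A minimal sketch of the expected usage; the model identifiers below
        are placeholders, not values shipped with the package:

            preprocessing = VLLM_Preprocessing(
                vlm_provider="ollama",
                vlm_model_id="llama3.2-vision",
                prompt="Transcribe all the text visible on this page.",
                embedding_model_hf_id="sentence-transformers/all-MiniLM-L6-v2",
                incipit_only=True,
            )
            # X is a PDFPathDataset exposing at least the "id" and "filepath"
            # columns read by transform()
            chunks = preprocessing.transform(X)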
    """

    def __init__(
        self,
        vlm_provider: Literal["ollama", "vllm", "openai"],
        vlm_model_id: str,
        prompt: str,
        embedding_model_hf_id: str,
        incipit_only: bool,
        max_chunk_size: int = 512,
        allowed_timeout: int = 60 * 5,
    ):
        """Provide the vlm model credentials and other parametres.

        Arguments:
            vlm_provider: the remote service to connect to
            vlm_model_id: the identifier of the vision LLM to call on the selected provider's server
            prompt: a string contextualizing the OCR operation of the vision LLM
            embedding_model_hf_id: the Hugging Face identifier of the embedding model, used to fetch its tokenizer
            incipit_only: whether to scan only the first pages instead of the whole document
            max_chunk_size: the maximum size of each text chunk
            allowed_timeout: the maximum duration, in seconds, allowed for scanning one PDF page

        Environment variable:
            The VLM_HOST_URL environment variable must be set to the base URL
            of the VLM endpoint, for example: http://localhost:8005
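
            For local experiments it can also be set from Python before the
            pipeline runs (a sketch, assuming a VLM server listening locally):

                import os

                os.environ["VLM_HOST_URL"] = "http://localhost:8005"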
        """
        # store the parameters for logging
        self.vlm_provider = vlm_provider
        self.vlm_model_id = vlm_model_id
        self.prompt = prompt
        self.embedding_model_hf_id = embedding_model_hf_id
        self.incipit_only = incipit_only
        self.max_chunk_size = max_chunk_size
        self.allowed_timeout = allowed_timeout

        self._chunker = vllm_doc_chunk_mod.get_chunker(
            embedding_model_hf_id, max_chunk_size
        )

    @override
    def transform(self, X: PDFPathDataset) -> PDFChunkDataset:
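        """Scan the batch of PDF files with the vision LLM and return the layout-aware text chunks."""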
        # Instantiate the converter at transform time so that the environment
        # variable holding the VLM endpoint is not frozen into the Transformer
        # instance when it is cached by joblib, as in standard sklearn workflows.
        vlm_options = (
            vllm_scan_mod.ollama_vlm_options(
                self.vlm_model_id, self.prompt, allowed_timeout=self.allowed_timeout
            )
            if self.vlm_provider == "ollama"
            else vllm_scan_mod.vllm_vlm_options(
                self.vlm_model_id, self.prompt, allowed_timeout=self.allowed_timeout
            )
        )
        converter = vllm_scan_mod.converter(vlm_options)
        conversion_results = vllm_scan_mod.process_documents(
            [(line["id"], Path(line["filepath"])) for _, line in X.iterrows()],
            converter,
            self.incipit_only,
        )
        chunked_results = iter(
            tqdm(
                (
                    (f, vllm_doc_chunk_mod.get_chunks(self._chunker, r))
                    for f, r in conversion_results
                ),
                desc="Chunking read text",
                unit="chunked files",
                total=len(X),
            )
        )
        return vllm_doc_chunk_mod.chunk_to_ds(chunked_results, self._chunker)