archaeo_super_prompt.types.pdfchunks

"""Abstract data type for handling a dataset of read pdfs."""

from collections.abc import Generator, Iterable
from typing import NewType, TypedDict, cast

from pandas import concat
from pandera.pandas import DataFrameModel
from pandera.typing import DataFrame, Series

# TODO: remove these dependencies
from .text_for_extractor import (
    Chunk,
    ChunkHumanDescription,
    Filename,
    PDFChunkEnumeration,
    PDFSources,
)

from .intervention_id import InterventionId

# TODO: stronger type checking


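# Pandera schema of the chunk table for a single intervention: one row per
# extracted chunk of a parsed PDF file.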
class PDFChunkSetPerInterventionSchema(DataFrameModel):
    filename: Series[str]
    chunk_type: list[str]
    chunk_page_position: list[int]
    chunk_index: Series[int]
    chunk_embedding_content: Series[str]
    chunk_content: Series[str]


class PDFChunkDatasetSchema(PDFChunkSetPerInterventionSchema):
    id: Series[int]


PDFChunkDataset = NewType("PDFChunkDataset", DataFrame[PDFChunkDatasetSchema])
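# NB: NewType only creates a static alias; calling PDFChunkDataset(...) performs
# no runtime schema validation on the wrapped DataFrame.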


class PDFChunkPerInterventionDataset:
    """DataFrame class wrapper to customize the auto-displaying from tracing tools such as mlflow."""

    def __init__(
        self,
        data: DataFrame[PDFChunkSetPerInterventionSchema],
    ) -> None:
        self.data = data

    def __add__(
        self, otherDF: "PDFChunkPerInterventionDataset"
    ) -> "PDFChunkPerInterventionDataset":
        return PDFChunkPerInterventionDataset(
            PDFChunkSetPerInterventionSchema.validate(
                self.data.combine_first(otherDF.data), lazy=True
            ),
        )

    def getExtractedPdfContent(self) -> PDFSources:
        """Let dataset be a set of chunks from several pdf files related to a single intervention. Computes the batch of chunk sources from this dataset.

        The dataset can be partial if a selection of chunks in each files has
        already been carried out.
        """

        def items_for_pdf_source(fileChunks: PDFChunkDataset):
            def process_row(row_):
                row = cast(PDFChunk, row_.to_dict())
                tag_description = row["chunk_type"]
                description = ChunkHumanDescription(
                    f"Chunk {row['chunk_index']} ({tag_description} page {row['chunk_page_position']})"
                )
                return description, row["chunk_content"]

            return dict(process_row(row) for _, row in fileChunks.iterrows())

        return {
            Filename(cast(str, filename)): items_for_pdf_source(
                PDFChunkDataset(cast(PDFChunkDataset, fileChunks))
            )
            for filename, fileChunks in self.data.groupby("filename")
        }

    def to_readable_context_string(self) -> PDFChunkEnumeration:
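        # Render each chunk as a header line of the form
        # `%% <filename> | Page <page position> (<chunk types>) %%`,
        # followed by the chunk content and a `---...---` separator.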
        msg: str = ""
        for _, chunk in self.data.iterrows():
            msg += f"`%% {chunk['filename']} | Page {chunk['chunk_page_position']} ({[str(label) for label in chunk['chunk_type']]}) %%`\n\n"
            msg += chunk["chunk_content"] + "\n" * 2
            msg += "`" + "-" * 60 + "`\n\n"
        return PDFChunkEnumeration(msg)

    def __str__(self) -> str:
        return self.to_readable_context_string()


PDFChunk = TypedDict(
    "PDFChunk",
    {
        "id": InterventionId,
        "filename": Filename,
        "chunk_type": str,
        "chunk_page_position": str,  # fraction: page number over total page number
        "chunk_index": int,
        "chunk_content": Chunk,
    },
)
"""NB: this type of row is unnormalized for a memory-efficient processing but
this might not be an issue in our pipeline, as the datasets are not huge and
the time processing wille be negligible next to the LLM and Embedding model
inferences
"""


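# Concatenate several chunk datasets into a single one with pandas.concat;
# note that no pandera re-validation is performed here.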
def composePdfChunkDataset(
    datasets: Generator[PDFChunkDataset] | Iterable[PDFChunkDataset],
) -> PDFChunkDataset:
    return PDFChunkDataset(cast(DataFrame, concat(datasets)))


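# Build a chunk dataset from in-memory PDFChunk rows (one row per chunk).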
def buildPdfChunkDataset(chunks: list[PDFChunk]) -> PDFChunkDataset:
    return PDFChunkDataset(DataFrame(chunks))
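
# Minimal end-to-end sketch (illustrative; `chunks_a` and `chunks_b` are
# hypothetical lists of PDFChunk rows):
#   dataset_a = buildPdfChunkDataset(chunks_a)
#   dataset_b = buildPdfChunkDataset(chunks_b)
#   full_dataset = composePdfChunkDataset([dataset_a, dataset_b])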