# Module: archaeo_super_prompt.modeling.struct_extract.chunks_to_text
# (header reconstructed from a documentation-page scrape; the original
# capture included site navigation text and a line-number gutter here.)
"""Management of the prompt attachment creation."""

from typing import cast
from pandera.typing.pandas import DataFrame
import pandas as pd
from sklearn.pipeline import FunctionTransformer
from ..entity_extractor.types import ChunksWithThesaurus
from .types import InputForExtractionWithSuggestedThesauri
from tqdm import tqdm


def ChunksToText():
    """Unifies the filtered chunks into one attachment text for an LLM prompt.

    This pipeline Transformer applies this chunk merge for each intervention.

    Returns:
        A scikit-learn ``FunctionTransformer`` that maps a
        ``DataFrame[ChunksWithThesaurus]`` (one row per chunk) to a
        ``DataFrame[InputForExtractionWithSuggestedThesauri]`` (one row per
        intervention ``id``, indexed by ``id``).
    """
    # TODO: define a unique ChunksWithSuggestedValues, regardless if its a
    # thesaurus identifier, an identified number, etc.

    def to_readable_context_string(
        filtered_chunks: DataFrame[ChunksWithThesaurus],
    ) -> str:
        """Merge one intervention's chunks into a single annotated text.

        Chunks are ordered by ``chunk_index``; each is preceded by a header
        line naming its source file, page and type, and followed by a
        horizontal-rule separator.
        """
        parts: list[str] = []
        # Sort once so the attachment follows the original document order.
        for _, chunk in filtered_chunks.sort_values(
            by="chunk_index"
        ).iterrows():
            parts.append(
                f"`%% {chunk['filename']} | Page {chunk['chunk_page_position']} ({chunk['chunk_type']}) %%`\n\n"
            )
            parts.append(chunk["chunk_content"] + "\n" * 2)
            parts.append("`" + "-" * 60 + "`\n\n")
        # str.join avoids the quadratic cost of repeated `msg +=`.
        return "".join(parts)

    def unify_thesaurus(X: DataFrame[ChunksWithThesaurus]):
        """Union the per-chunk thesaurus collections into one set."""
        return set().union(*X["identified_thesaurus"].tolist())

    def _intervention_row(
        id_, filtered_chunks: DataFrame[ChunksWithThesaurus]
    ) -> dict:
        """Build one output row (merged text + unified thesauri) for ``id_``."""
        return {
            "id": id_,
            "merged_chunks": to_readable_context_string(filtered_chunks),
            "identified_thesaurus": list(unify_thesaurus(filtered_chunks)),
        }

    def ChunksToPromptContent(
        X: DataFrame[ChunksWithThesaurus],
    ) -> DataFrame[InputForExtractionWithSuggestedThesauri]:
        """Group chunks by intervention ``id`` and emit one row per group."""
        return InputForExtractionWithSuggestedThesauri.validate(
            pd.DataFrame(
                _intervention_row(
                    id_, cast(DataFrame[ChunksWithThesaurus], filtered_chunks)
                )
                for id_, filtered_chunks in tqdm(
                    X.groupby("id"),
                    "Gathering filtered chunks",
                    unit="intervention",
                )
            ).set_index("id")
        )

    return FunctionTransformer(ChunksToPromptContent)