Skip to content

archaeo_super_prompt.modeling.other_dag

[docs] module archaeo_super_prompt.modeling.other_dag

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""DAGs to train the FieldExtractor models."""

from typing import NamedTuple

from archaeo_super_prompt.modeling.struct_extract.field_extractor import (
    FieldExtractor,
)

from ..dataset.thesauri import load_comune
from .DAG_builder import DAGBuilder, DAGComponent
from .entity_extractor import NerModel, NeSelector
from .pdf_to_text import VLLM_Preprocessing
from .struct_extract.chunks_to_text import ChunksToText
from .struct_extract.extractors.archiving_date import ArchivingDateProvider
from .struct_extract.extractors.comune import ComuneExtractor
from .struct_extract.extractors.intervention_date import (
    InterventionStartExtractor,
)


class ExtractionDAGParts(NamedTuple):
    """A decomposition of the general DAG into separate parts.

    Splitting the DAG this way allows the training, inference and
    evaluation modes to handle each part differently (e.g. train the
    field extractors in isolation, then re-bind them for inference).
    """

    # Builder for the shared pre-processing sub-DAG (presumably the
    # PDF-reading / NER stages that every extractor depends on —
    # TODO confirm against the code that constructs this tuple).
    preprocessing_root: DAGBuilder
    # One entry per field extractor: the extractor component paired with
    # the upstream component it must be attached to in the full DAG.
    extraction_parts: list[tuple[DAGComponent[FieldExtractor], DAGComponent]]
    # The terminal union component together with the list of components
    # that feed into it, used to finish building the DAG in inference mode.
    final_component: tuple[DAGComponent, list[DAGComponent]]


def get_advanced_pipeline() -> DAGBuilder:
    """Return the most advanced pre-processing DAG for the model.

    All estimators and transformers are initialized with particular
    parameters (VLM/LLM model ids, sampling temperature, named-entity
    chunk filters).

    Returns:
        The fully wired :class:`DAGBuilder`: a vision-LM reader feeding a
        NER stage, per-field chunk filter/merger chains, the LLM-backed
        field extractors, and a final "FINAL" passthrough node that joins
        every extractor's output.
    """
    # Shared configuration for every LLM-backed field extractor below.
    llm_model_id = "google/gemma-3-27b-it"
    llm_provider = "vllm"
    llm_model_temp = 0.05

    # Vision-LM OCR stage: turns (the incipit of) a scanned PDF into
    # markdown-ish text chunks with embeddings.
    vllm = DAGComponent(
        "vision-lm-Reader",
        VLLM_Preprocessing(
            vlm_provider="vllm",
            vlm_model_id="ibm-granite/granite-vision-3.3-2b",
            incipit_only=True,
            prompt="OCR this part of Italian document for markdown-based processing.",
            embedding_model_hf_id="nomic-ai/nomic-embed-text-v1.5",
        ),
    )
    ner = DAGComponent("NER-Extractor", NerModel())
    # Passthrough node exposing both the raw text and the NER output to
    # downstream chunk filters.
    ner_featured = DAGComponent("ner-featured", "passthrough")
    archiving_date = DAGComponent(
        "archiving-date-Oracle", ArchivingDateProvider()
    )
    # Keep only chunks tagged DATA, boosted with season keywords, for the
    # intervention-start date extraction.
    intervention_date_chunk_filter = DAGComponent(
        "interv-start-CF",
        NeSelector(
            "data",
            {
                "DATA",
            },
            lambda: list(
                enumerate(
                    [
                        "primavera",
                        "estate",
                        "autunno",
                        "inverno",
                    ]
                )
            ),
            True,
        ),
    )
    intervention_date_chunk_merger = DAGComponent(
        "interv-start-CM", ChunksToText()
    )
    intervention_date_extractor = DAGComponent(
        "interv-start-Extractor",
        InterventionStartExtractor(llm_provider, llm_model_id, llm_model_temp),
    )
    comune_extractor = DAGComponent(
        "comune-Extractor",
        ComuneExtractor(llm_provider, llm_model_id, llm_model_temp),
    )
    # Keep only location-like chunks (addresses, postal codes, places),
    # matched against the comune thesaurus.
    comune_chunk_filter = DAGComponent(
        "comune-CF",
        NeSelector(
            "comune",
            {
                "INDIRIZZO",
                "CODICE_POSTALE",
                "LUOGO",
            },
            load_comune,
        ),
    )
    comune_chunk_merger = DAGComponent("comune-CM", ChunksToText())

    # Passthrough joining the merged date chunks with the archiving date
    # so the extractor sees both.
    intervention_date_entrypoint = DAGComponent(
        "interv-start-entrypoint", "passthrough"
    )
    final_results = DAGComponent[FieldExtractor]("FINAL", "passthrough")

    # NOTE(review): fonte_informazione and functionary both reuse
    # ArchivingDateProvider(); looks like placeholders for dedicated
    # extractors — confirm this is intentional.
    fi_entrypoint = DAGComponent("fonte-informaz-entrypoint", "passthrough")
    fonte_informazione = DAGComponent(
        "fonte-informaz-Deductor", ArchivingDateProvider()
    )
    functionary_selector = DAGComponent(
        "functionary-CF",
        NeSelector("functionary", {"NOME", "COGNOME"}, lambda: [], True),
    )
    functionary_merger = DAGComponent("functionary-CM", ChunksToText())
    functionary_entrypoint = DAGComponent(
        "functionary-entrypoint", "passthrough"
    )
    functionary = DAGComponent(
        "functionary-Extractor", ArchivingDateProvider()
    )

    # Wire everything together; the edge lists name each node's parents.
    return (
        DAGBuilder()
        .add_node(vllm)
        .add_node(ner, [vllm])
        .add_node(ner_featured, [vllm, ner])
        .add_node(archiving_date, [vllm])
        .add_linearly_chained_nodes(
            [comune_chunk_filter, comune_chunk_merger],
            [ner_featured],
        )
        .add_linearly_chained_nodes(
            [intervention_date_chunk_filter, intervention_date_chunk_merger],
            [ner_featured],
        )
        .add_node(
            intervention_date_entrypoint,
            [intervention_date_chunk_merger, archiving_date],
        )
        .add_node(intervention_date_extractor, [intervention_date_entrypoint])
        .add_node(comune_extractor, [comune_chunk_merger])
        .add_linearly_chained_nodes(
            [fi_entrypoint, fonte_informazione],
            [archiving_date, comune_extractor],
        )
        .add_linearly_chained_nodes(
            [functionary_selector, functionary_merger], [ner_featured]
        )
        .add_node(
            functionary_entrypoint,
            [functionary_merger, intervention_date_extractor, comune_extractor],
        )
        .add_node(functionary, [functionary_entrypoint])
        .add_node(
            final_results,
            [archiving_date, comune_extractor, intervention_date_extractor,
             fonte_informazione, functionary],
        )
    )