archaeo_super_prompt.modeling.train

"""DAGs to train the FieldExtractor models."""

from typing import NamedTuple, cast

from archaeo_super_prompt.dataset.load import MagohDataset
from archaeo_super_prompt.modeling.struct_extract.field_extractor import (
    FieldExtractor,
)

from ..dataset.thesauri import load_comune
from ..types.pdfpaths import PDFPathDataset
from ..utils.result import get_model_store_dir
from .DAG_builder import DAGBuilder, DAGComponent
from .entity_extractor import NerModel, NeSelector
from .pdf_to_text import VLLM_Preprocessing
from .struct_extract.chunks_to_text import ChunksToText
from .struct_extract.extractors.archiving_date import ArchivingDateProvider
from .struct_extract.extractors.comune import ComuneExtractor
from .struct_extract.extractors.intervention_date import (
    InterventionStartExtractor,
)

class ExtractionDAGParts(NamedTuple):
    """A decomposition of the general DAG into different parts for a better handling between the training, the inference and the evaluation modes."""
    preprocessing_root: DAGBuilder
    extraction_parts: list[tuple[DAGComponent[FieldExtractor], DAGComponent]]
    final_component: tuple[DAGComponent, list[DAGComponent]]


def get_training_dag() -> ExtractionDAGParts:
    """Return the most advanced pre-processing DAG for the model.

    All its estimators and transformers are initialized with particular
    parametres.

    Return:
        A part of the complete DAG for getting the pre-processed data.
        The field extractors related to their parent node, to apply on these extractors special training or evaluation operations or to bind them to the preprocessing dag
        The final union component to finish the building of the complete DAG
        in inference mode.
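
    Example:
        An illustrative sketch (an assumption based on the builder calls
        in this module, not a documented recipe) of how the three parts
        can be recombined into a single inference pipeline::

            parts = get_training_dag()
            builder = parts.preprocessing_root
            for extractor, parent in parts.extraction_parts:
                # bind each field extractor to its parent node
                builder = builder.add_node(extractor, [parent])
            final, deps = parts.final_component
            pipeline = builder.add_node(final, deps).make_dag()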
    """
    llm_model_id = "google/gemma-3-27b-it"
    llm_provider = "vllm"
    llm_model_temp = 0.05

    vllm = DAGComponent(
        "vision-lm-Reader",
        VLLM_Preprocessing(
            vlm_provider="vllm",
            vlm_model_id="ibm-granite/granite-vision-3.3-2b",
            incipit_only=True,
            prompt="OCR this part of Italian document for markdown-based processing.",
            embedding_model_hf_id="nomic-ai/nomic-embed-text-v1.5",
        ),
    )
    ner = DAGComponent("NER-Extractor", NerModel())
    ner_featured = DAGComponent("ner-featured", "passthrough")
    archiving_date = DAGComponent(
        "archiving-date-Oracle", ArchivingDateProvider()
    )
    intervention_date_chunk_filter = DAGComponent(
        "interv-start-CF",
        NeSelector(
            "data",
            {
                "DATA",
            },
            lambda: list(
                enumerate(
                    [
                        "primavera",
                        "estate",
                        "autunno",
                        "inverno",
                    ]
                )
            ),
            True,
        ),
    )
    intervention_date_chunk_merger = DAGComponent(
        "interv-start-CM", ChunksToText()
    )
    intervention_date_extractor = DAGComponent(
        "interv-start-Extractor",
        InterventionStartExtractor(llm_provider, llm_model_id, llm_model_temp),
    )
    comune_extractor = DAGComponent(
        "comune-Extractor",
        ComuneExtractor(llm_provider, llm_model_id, llm_model_temp),
    )
    comune_chunk_filter = DAGComponent(
        "comune-CF",
        NeSelector(
            "comune",
            {
                "INDIRIZZO",
                "CODICE_POSTALE",
                "LUOGO",
            },
            load_comune,
        ),
    )
    comune_chunk_merger = DAGComponent("comune-CM", ChunksToText())

    intervention_date_entrypoint = DAGComponent(
        "interv-start-entrypoint", "passthrough"
    )
    final_results = DAGComponent[FieldExtractor]("FINAL", "passthrough")

    preprocessing_part = (
        DAGBuilder()
        .add_node(vllm)
        .add_node(ner, [vllm])
        .add_node(ner_featured, [vllm, ner])
        .add_node(archiving_date, [vllm])
        .add_linearly_chained_nodes(
            [comune_chunk_filter, comune_chunk_merger],
            [ner_featured],
        )
        .add_linearly_chained_nodes(
            [intervention_date_chunk_filter, intervention_date_chunk_merger],
            [ner_featured],
        )
        .add_node(
            intervention_date_entrypoint,
            [intervention_date_chunk_merger, archiving_date],
        )
    )
    extraction_part = cast(
        list[tuple[DAGComponent[FieldExtractor], DAGComponent]],
        [
            (intervention_date_extractor, intervention_date_entrypoint),
            (comune_extractor, comune_chunk_merger),
        ],
    )
    final_part = (
        final_results,
        [
            archiving_date,
            intervention_date_extractor,
            comune_extractor,
        ],
    )
    return ExtractionDAGParts(preprocessing_part, extraction_part, final_part)


def train_from_scratch(
    training_input: PDFPathDataset, ds: MagohDataset
) -> ExtractionDAGParts:
    """Return the most advanced DAG model, fitted from the data.

    Train each FieldExtractor model and save its compiled prompt model
    under get_model_store_dir().
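
    Example:
        A minimal usage sketch; ``pdf_paths`` and ``ds`` are hypothetical
        names for already-constructed dataset objects::

            fitted_parts = train_from_scratch(pdf_paths, ds)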
    """
    preprocessing_part, extraction_part, final_part = get_training_dag()
    preprocess_pipeline = preprocessing_part.make_dag()
    preprocessed_inputs = preprocess_pipeline.fit_transform(training_input, ds)
    for fe_component, dep in extraction_part:
        field_extractor = fe_component.component
        if isinstance(field_extractor, str):
            # Unreachable: "passthrough" string components never appear
            # among the extraction parts; this check only narrows the type.
            continue
        field_extractor.fit(preprocessed_inputs[dep.component_id], ds)
        field_extractor.prompt_model_.save(
            get_model_store_dir() / f"{fe_component.component_id}.json"
        )
    return ExtractionDAGParts(preprocessing_part, extraction_part, final_part)


def get_fitted_model(
    training_input: PDFPathDataset, ds: MagohDataset
) -> ExtractionDAGParts:
    """Return the most advanced DAG model, mockly fitted from the data.

    The FieldExtractor model are supposed already fitted from saved dspy
    models in get_model_store_dir() path.
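
    Example:
        An illustrative sketch; it assumes the ``.json`` files written by
        train_from_scratch already exist under get_model_store_dir();
        ``pdf_paths`` and ``ds`` are hypothetical names for
        already-constructed dataset objects::

            fitted_parts = get_fitted_model(pdf_paths, ds)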
    """
    preprocessing_part, extraction_part, final_part = get_training_dag()
    preprocess_pipeline = preprocessing_part.make_dag()
    preprocessed_inputs = preprocess_pipeline.fit_transform(training_input, ds)
    for fe_component, dep in extraction_part:
        field_extractor = fe_component.component
        if isinstance(field_extractor, str):
            # Unreachable: "passthrough" string components never appear
            # among the extraction parts; this check only narrows the type.
            continue
        field_extractor.fit(
            preprocessed_inputs[dep.component_id],
            ds,
            compiled_dspy_model_path=get_model_store_dir()
            / f"{fe_component.component_id}.json",
        )
    return ExtractionDAGParts(preprocessing_part, extraction_part, final_part)


# TODO: implement inference from the PDF paths, and the evaluation mode.