archaeo_super_prompt.modeling.struct_extract.extractors.comune

[docs] package archaeo_super_prompt.modeling.struct_extract.extractors.comune
"""Comune LLM extractor."""import refrom typing importcast,overrideimport dspyimport pydanticfrom pandera.typing.pandas importSeriesfrom archaeo_super_prompt.dataset.load importMagohDatasetfrom archaeo_super_prompt.dataset.thesauri importload_comune_with_provinciefrom archaeo_super_prompt.modeling.struct_extract.types import(InputForExtractionWithSuggestedThesauri,InputForExtractionWithSuggestedThesauriRowSchema,)from archaeo_super_prompt.types.intervention_id importInterventionIdfrom .....types.per_intervention_feature import(BasePerInterventionFeatureSchema,)from ...field_extractor importFieldExtractor,LLMProvider,to_prediction# -- DSPy partclass Comune(pydantic.BaseModel):[docs]
    """Questo elemento fornisce informazioni su un comune. È possibile trovare questo tipo di informazioni nel testo."""citta_nome:strprovicia_nome:strprovincia_sigla:strclass IdentificaComune(dspy.Signature):[docs]
    """Identifica il unico comune in cui si sono svolti i lavori archeologici descritti in questi frammenti di relazione. I comuni possibili sono indicati."""fragmenti_relazione:str=dspy.InputField(desc="In ogni frammento sono indicati il nome del file pdf e la sua posizione nel file.")possibili_comuni:list[Comune]=dspy.InputField(desc="Scegliete un di questi comuni")comune:str=dspy.OutputField(desc="Il nome completo del comune")provincia:str=dspy.OutputField(desc="Il nome completo della provincia")class ComuneInputData(pydantic.BaseModel):[docs]
    """Chunks of reports of an archaeological intervention with supposed information about the comune where the operations took place.    Identified likely comuni with their province are also provided to help in the extraction.    """fragmenti_relazione:strpossibili_comuni:list[Comune]class ComuneOutputData(pydantic.BaseModel):[docs]
    """A predicted comune where the intervention took place, with its provincia."""comune:strprovincia:strclass FindComune(dspy.Module):[docs]
    """DSPy model for the extraction of the comune."""def __init__(self):        """Initialize only a chain of thought."""self._estrattore_di_comune=dspy.ChainOfThought(IdentificaComune)def forward([docs]
self,fragmenti_relazione:str,possibili_comuni:list[Comune])->dspy.Prediction:        """Direct forward."""predicted_output=cast(dspy.Prediction,self._estrattore_di_comune(fragmenti_relazione=fragmenti_relazione,possibili_comuni=possibili_comuni,),)WRONG_COMUNE="%ERROR_COMUNE%"WRONG_PROVINCIA="%ERROR_PROVINCIA%"returnto_prediction(ComuneOutputData(comune=cast(str,predicted_output.get("comune",WRONG_COMUNE)),provincia=cast(str,predicted_output.get("provincia",WRONG_PROVINCIA)),))# -- SKlearn partclass ComuneFeatSchema(BasePerInterventionFeatureSchema):[docs]
    """Extracted data about the Comune."""comune_id:intprovincia_id:intclass ComuneExtractor([docs]
FieldExtractor[ComuneInputData,ComuneOutputData,InputForExtractionWithSuggestedThesauri,InputForExtractionWithSuggestedThesauriRowSchema,ComuneFeatSchema,]):    """Dspy-LLM-based extractor of the comune data."""def __init__(self,llm_model_provider:LLMProvider,llm_model_id:str,llm_temperature:float,)->None:        """Initialize the extractor with providing it the llm which will be used."""example=(ComuneInputData(fragmenti_relazione=""""Relazione_scavo.pdf, Pagina 1 :L'evento si è svolto a Lucca.""",possibili_comuni=[Comune(citta_nome="Lucca",provicia_nome="Lucca",provincia_sigla="LU",)],),ComuneOutputData(comune="Lucca",provincia="Lucca"),)# TODO: load this more lazilyself._thesaurus=load_comune_with_provincie()super().__init__(llm_model_provider,llm_model_id,llm_temperature,FindComune(),example,ComuneOutputData,)@overridedef _to_dspy_input(self,x)->ComuneInputData:comuni,province=self._thesauruspossible_comuni=comuni.iloc[x.identified_thesaurus].merge(province,on="province_id",suffixes=("_comune","_province"))returnComuneInputData(fragmenti_relazione=x.merged_chunks,possibili_comuni=[Comune(citta_nome=cast(str,c.name_comune),provicia_nome=cast(str,c.name_province),provincia_sigla=cast(str,c.sigla),)forcinpossible_comuni.itertuples()],)@overridedef _transform_dspy_output(self,y):comuni,province=self._thesaurusreturnComuneFeatSchema.validate(self._identity_output_set_transform_to_df(y).assign(schedaid=lambdadf:df.index).merge(province[["name"]].assign(provincia_id=province.index),left_on="provincia",right_on="name",)[["schedaid","comune","provincia_id"]].merge(comuni.assign(comune_id=comuni.index),left_on=["comune","provincia_id"],right_on=["name","province_id"],)[["schedaid","comune_id","provincia_id"]].rename(columns={"schedaid":"id"}).set_index("id"),# TODO: add this after tests# lazy=True)@override@classmethoddef _compare_values(cls,predicted,expected):TRESHOLD=0.95return0.7*int(predicted.comune==expected.comune)+0.3*int(predicted.provincia==expected.provincia),TRESHOLD@override@classmethoddef filter_training_dataset([docs]
cls,y:MagohDataset,ids:set[InterventionId])->set[InterventionId]:returny.filter_good_records_for_training(ids,lambdadf:cast(Series[bool],df["university__Comune"].notnull()),)@override@classmethoddef _select_answers(cls,y:MagohDataset,ids:set[InterventionId])->dict[InterventionId,ComuneOutputData]:def to_comune_data(comune_string:str|None)->ComuneOutputData:default_output=ComuneOutputData(comune="",provincia="")ifcomune_stringisNone:returndefault_outputpattern=r"^(.*?) \((.*?)\)$"match=re.match(pattern,comune_string)ifmatch:comune,provincia=match.groups()returnComuneOutputData(comune=comune,provincia=provincia)returndefault_outputreturn{InterventionId(t.id):to_comune_data(t.university__Comune)fortiny.get_answers(ids)}@override@staticmethoddef field_to_be_extracted():[docs]
return"comune"