Skip to content

archaeo_super_prompt.dataset.load

[docs] module archaeo_super_prompt.dataset.load

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""Module gathering the loaders for a training/evaluation dataset.

The data are slightly-transformed records of the Magoh database.
"""

from collections.abc import Callable
from typing import NamedTuple
import pandas as pd
from pandera.typing.pandas import DataFrame, Series
from tqdm import tqdm

from ..types.intervention_id import InterventionId
from ..types.pdfpaths import PDFPathSchema
from ..types.structured_data import (
    ExtractedStructuredDataSeries,
    FeaturedOutputStructureDataSchema,
    OutputStructuredDataSchema,
    outputStructuredDataSchema_itertuples,
)

from .postgresql_engine import get_entries, get_entries_with_ids
from .minio_engine import download_files
from ..utils.cache import get_memory_for
from ..utils.norm import variabilize_column_name


def _parse_intervention_data(intervention_data__df: pd.DataFrame):
    filtered_df = (
        intervention_data__df.filter(
            regex="^(scheda_intervento.id|intervention_start_date_*|duration_(value|precision)|(university|building|check).*)"
        )
        .astype(
            {
                "university.Numero di saggi": "UInt32",
                "university.Geologico": "boolean",
                "university.Profondità falda": "Float64",
                "university.Profondità massima": "Float64",
                "duration_value": "UInt32"
            }
        )
    )

    return filtered_df


def _parse_and_get_files(intervention_data: pd.DataFrame):
    """Parse the intervention metadata and download the related PDF files.

    Args:
        intervention_data: raw intervention records from the Magoh database.

    Returns:
        A pair ``(intervention_data, files)``: the filtered/typed metadata
        and a validated DataFrame mapping each intervention id to the local
        paths of its downloaded files.
    """
    intervention_data = _parse_intervention_data(intervention_data)
    # One small DataFrame per intervention, concatenated at the end.
    per_intervention_frames = []
    progress = tqdm(
        intervention_data["scheda_intervento.id"],
        desc="Downloaded files",
        unit="interventions",
    )
    for intervention_id in progress:
        rows = [
            {"id": intervention_id, "filepath": str(path.resolve())}
            for path in download_files(intervention_id)
        ]
        per_intervention_frames.append(pd.DataFrame(rows))
    files = PDFPathSchema.validate(
        pd.concat(per_intervention_frames, ignore_index=True)
    )
    return intervention_data, files


@get_memory_for("external").cache
def _init_with_cache(size: int, seed: float, only_recent_entries=False):
    intervention_data, findings = get_entries(size, seed, only_recent_entries)
    intervention_data, files = _parse_and_get_files(intervention_data)
    return intervention_data, findings, files


@get_memory_for("external").cache
def _init_with_cache_for_ids(ids: set[int]):
    intervention_data, findings = get_entries_with_ids(ids)
    intervention_data, files = _parse_and_get_files(intervention_data)
    return intervention_data, findings, files


class SamplingParams(NamedTuple):
    """Parametres for sampling records in the training dataset."""

    # Number of intervention records to sample.
    size: int
    # Random seed forwarded to the sampler (see get_entries).
    seed: float
    # Presumably restricts sampling to recently-registered entries —
    # confirm against get_entries' implementation.
    only_recent_entries: bool


"""A set of interventions identified by their schedaid."""
type IdSet = set[int]


class MagohDataset:
    """Class to interact with the general training/evaluation dataset.

    At the initialisation, fetch the data from the cache or from the remote
    dataset if needed.
    """

    def __init__(self, params: IdSet | SamplingParams):
        """Fetch intervention records from the Magoh's training database.

        Args:
            params: a set of intervention identifiers to be fetched or a \
group of sampling params to randomly fetch intervention records
        """
        if isinstance(params, SamplingParams):
            # Random sampling: delegate to the size/seed-keyed cached loader.
            size, seed, only_recent_entries = params
            intervention_data, self._findings, self._files = _init_with_cache(
                size, seed, only_recent_entries
            )
        else:
            # Explicit id set: fetch exactly those interventions.
            intervention_data, self._findings, self._files = (
                _init_with_cache_for_ids(params)
            )
        self._intervention_data = self._normalize_metadata_df(
            intervention_data
        )

    @property
    def intervention_data(self):
        """A DataFrame with the truth metadata of registered records in Magoh."""
        return self._intervention_data

    @property
    def legacy_intervention_data(
        self,
    ) -> DataFrame[OutputStructuredDataSchema]:
        """The intervention data in the old schema for the legacy model."""
        # NOTE(review): `(university|building)__*` matches the prefix plus
        # one-or-more underscores; `__.*` may have been intended — confirm
        # against the normalized column names.
        return OutputStructuredDataSchema.validate(
            self._intervention_data.filter(
                regex="^(id|(university|building)__*)"
            ),
            # TODO: add this once the method is tested
            # lazy=True
        )

    @classmethod
    def _normalize_metadata_df(
        cls, df: pd.DataFrame
    ) -> DataFrame[FeaturedOutputStructureDataSchema]:
        """Project the raw columns onto the featured schema and rename them.

        Args:
            df: the parsed intervention metadata (raw column names).

        Returns:
            The validated DataFrame with `scheda_intervento.id` renamed to
            `id` and all columns normalized by `variabilize_column_name`.
        """
        return FeaturedOutputStructureDataSchema.validate(
            df.filter(
                regex="^(scheda_intervento.id|intervention_start_date_*|duration_(value|precision)|(university|building).*)"
            )
            .rename(columns={"scheda_intervento.id": "id"})
            .rename(columns=variabilize_column_name),
            # TODO: add this once the method is tested
            # lazy=True
        )

    def get_answer(self, id_: InterventionId) -> ExtractedStructuredDataSeries:
        """Return the metadata of a magoh record with the given id.

        Raises:
            KeyError: if no loaded record has the given id.
        """
        records = self._intervention_data
        record = records[records["id"] == id_]
        if record.empty:
            # KeyError is more specific than the previous bare Exception
            # while remaining catchable by existing `except Exception`.
            raise KeyError(f"Unable to get record with id {id_}")
        return record.iloc[0].to_dict()

    def filter_good_records_for_training(
        self,
        ids: set[InterventionId],
        condition: Callable[
            [DataFrame[FeaturedOutputStructureDataSchema]], Series[bool]
        ],
    ) -> set[InterventionId]:
        """Return only the ids for which the intervention records match a given condition.

        Args:
            ids: the set of interventions to select
            condition: a function taking the training metadata dataframe and \
returning a series of boolean to filter the records with unusable values
        """
        matching = self._intervention_data[
            self._intervention_data["id"].isin(ids)
        ]
        return {
            InterventionId(id_)
            for id_ in matching[condition(matching)]["id"].to_list()
        }

    def get_answers(self, ids: set[InterventionId]):
        """Return the answers for each of the asked interventions.

        Raises:
            ValueError: if at least one asked intervention has no answer
                stored in the dataset.
        """
        records = self._intervention_data
        filtered = records[records["id"].isin(ids)]
        if len(filtered) != len(ids):
            # ValueError is more specific than the previous bare Exception
            # while remaining catchable by existing `except Exception`.
            raise ValueError(
                "All the asked interventions does not have their answers stored in the dataset"
            )
        return outputStructuredDataSchema_itertuples(filtered)

    @property
    def findings(self):
        """Return a dataframe with the fetched findings data."""
        return self._findings

    def get_files_for_batch(self, ids: set[InterventionId]):
        """Return the files only related to the given intervention ids."""
        return self._files[self._files["id"].isin(ids)]

    @property
    def files(self):
        """Return all the files with their related intervention id."""
        return self._files