# archaeo_super_prompt.dataset.normalization.intervention_date.utils

# Source code of module archaeo_super_prompt.dataset.normalization.intervention_date.utils
"""Utils for piping normalization functions."""from collections.abc importCallablefrom typing importLiteral,NamedTuple,Optionalimport pandas as pdfrom pandera.pandas importDataFrameModel,Fieldfrom pandera.typing.pandas importDataFrame,SeriesPrecision=Literal["day","month","year"]class Date(NamedTuple):[docs]
    """Not completely normalized date, but the day, the month and the year are already separated by /."""start_date:str# d/m/yend_date:str# d/m/yprecision:Precisionclass Duration(NamedTuple):[docs]
    """Tuple to represent a uniform duration."""nb:intunit:Precisionclass RawInterventionDataForDateNormalization(DataFrameModel):[docs]
    """This is the schema of usefull columns for normalizing the intervention dates."""idscheda:Series[int]data_protocollo:Optional[Series[str]]=Field(nullable=True)# noqa: UP045data_intervento:Series[str]anno:Series[int]norm_duration:Optional[tuple[int,Precision]]=Field(nullable=True)# noqa: UP045class InterventionDataForDateNormalization(DataFrameModel):[docs]
    """This is the schema of usefull columns for normalizing the intervention dates."""idscheda:Series[int]data_protocollo:Optional[Series[str]]=Field(nullable=True)# noqa: UP045data_intervento:Series[str]anno:Series[pd.Int32Dtype]norm_duration:Optional[tuple[int,Precision]]=Field(nullable=True)# noqa: UP045norm_date:Optional[Date]=Field(nullable=True)# noqa: UP045class InterventionDataForDateNormalizationRowSchema(NamedTuple):[docs]
    """Row schema of the class above."""idscheda:intdata_protocollo:strdata_intervento:stranno:intnorm_duration:Duration|Nonenorm_date:Date|NoneDateProcessor=Callable[[InterventionDataForDateNormalizationRowSchema],Date|None]def process_if_not_yet([docs]
row:InterventionDataForDateNormalizationRowSchema,fn:DateProcessor)->Date|None:    """For each row not processed yet, apply a normalization function.    This normalization function try to normalize if the humanly-input date    matches with patterns that it supports. Else, it returns None.    """current_answer=row.norm_dateifcurrent_answerisnotNone:returncurrent_answerreturnfn(row)def pipe([docs]
s:DataFrame[RawInterventionDataForDateNormalization],functions:tuple[DateProcessor,...],)->DataFrame[InterventionDataForDateNormalization]:    """Apply to the raw date df a range of normalization functions.    This functions tries to cover a maximum of humanly-input dates.    """def pipe_aux(s:DataFrame[InterventionDataForDateNormalization],functions:tuple[DateProcessor,...],):ifnotfunctions:returnsreturnpipe_aux(s.assign(norm_date=lambdadf:df.apply(lambdarow:process_if_not_yet(InterventionDataForDateNormalizationRowSchema(*tuple(row)),functions[0],),axis=1,)),(*functions[1:],),)returnpipe_aux(InterventionDataForDateNormalization.validate(s.assign(norm_date=None),lazy=True),functions,)