Skip to content

archaeo_super_prompt.types.structured_data

[docs] module archaeo_super_prompt.types.structured_data

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""Types related to the wanted structured data in the dataset."""

from collections.abc import Iterator
import datetime
from typing import Any, NamedTuple, Optional, cast
import pandera.pandas as pa
from pandera.typing.pandas import DataFrame, Series
import pandas as pd


def _negativeFloatColumn():
    """Build a nullable ``Float64`` column whose values must be below 0.000001.

    Despite the name, the upper bound is 0.000001 rather than 0, so zero and
    positive values smaller than one millionth also pass the check.

    Returns:
        A new ``pa.Column`` instance on every call.
    """
    below_tolerance = pa.Check.lt(1e-6)
    return pa.Column(pd.Float64Dtype, below_tolerance, nullable=True)


# Pandera schema keyed by dotted column names of the form "<group>.<label>".
# - "scheda_intervento.id" and every "check.*" column are declared without
#   ``nullable=True``.
# - Every "university.*" and "building.*" column is declared nullable.
structuredDataSchema = pa.DataFrameSchema(
    {
        # Integer column, not declared nullable.
        "scheda_intervento.id": pa.Column(int),
        # --- "university.*" group: nullable columns, mostly strings ---
        "university.Sigla": pa.Column(str, nullable=True),
        "university.Comune": pa.Column(str, nullable=True),
        "university.Ubicazione": pa.Column(str, nullable=True),
        "university.Indirizzo": pa.Column(str, nullable=True),
        "university.Località": pa.Column(str, nullable=True),
        "university.Data intervento": pa.Column(str, nullable=True),
        "university.Tipo di intervento": pa.Column(str, nullable=True),
        "university.Durata": pa.Column(str, nullable=True),
        "university.Eseguito da": pa.Column(str, nullable=True),
        "university.Direzione scientifica": pa.Column(str, nullable=True),
        "university.Estensione": pa.Column(str, nullable=True),
        # Nullable unsigned 32-bit integer; the ge(0) check is stated
        # explicitly in addition to the unsigned dtype.
        "university.Numero di saggi": pa.Column(
            "UInt32", pa.Check.ge(0), nullable=True
        ),
        # Nullable Float64 restricted to values below 0.000001
        # (see _negativeFloatColumn).
        "university.Profondità massima": _negativeFloatColumn(),
        # Nullable boolean (pandas "boolean" extension dtype string).
        "university.Geologico": pa.Column("boolean", nullable=True),
        "university.OGD": pa.Column(str, nullable=True),
        "university.OGM": pa.Column(str, nullable=True),
        # Same constraint as "university.Profondità massima".
        "university.Profondità falda": _negativeFloatColumn(),
        # --- "check.*" group: plain bool columns, none declared nullable ---
        "check.Preistoria": pa.Column(bool),
        "check.Età Protostorica": pa.Column(bool),
        "check.Età Etrusca": pa.Column(bool),
        "check.Età Romana": pa.Column(bool),
        "check.Età Tardoantica": pa.Column(bool),
        "check.Alto Medioevo": pa.Column(bool),
        "check.Basso Medioevo": pa.Column(bool),
        "check.Età Moderna": pa.Column(bool),
        "check.Età Contemporanea": pa.Column(bool),
        "check.Non identificati": pa.Column(bool),
        # --- "building.*" group: nullable string columns ---
        "building.Istituzione": pa.Column(str, nullable=True),
        "building.Funzionario competente": pa.Column(str, nullable=True),
        "building.Tipo di documento": pa.Column(str, nullable=True),
        "building.Protocollo": pa.Column(str, nullable=True),
        "building.Data Protocollo": pa.Column(str, nullable=True),
    }
)


class OutputStructuredDataSchema(pa.DataFrameModel):
    """Schema of the intervention target metadata in the dataset.

    The field names follow the keys of ``structuredDataSchema`` with ``.``
    written as ``__`` and spaces written as ``_``. Differences visible in this
    file: the identifier is named ``id`` here (``scheda_intervento.id`` there),
    the ``check.*`` columns have no counterpart here, and the two depth
    columns carry no value check here.
    """

    # NOTE(review): per pandera's DataFrameModel conventions, ``Optional[...]``
    # is expected to make the column itself optional while ``nullable=True``
    # allows missing values inside it — confirm against the pandera version
    # in use. The ``noqa: UP045`` markers keep ``Optional[...]`` from being
    # rewritten to ``X | None`` by the linter.

    # Only field declared without ``Optional`` and without ``nullable=True``.
    id: Series[int]
    # --- "university" group ---
    university__Sigla: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Comune: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Ubicazione: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Indirizzo: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Località: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Data_intervento: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Tipo_di_intervento: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Durata: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Eseguito_da: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Direzione_scientifica: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Estensione: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    # Pandas nullable extension dtypes (UInt32 / Float64 / boolean) are used
    # for the non-string columns.
    university__Numero_di_saggi: Optional[Series[pd.UInt32Dtype]] = pa.Field(nullable=True)  # noqa: UP045
    university__Profondità_massima: Optional[Series[pd.Float64Dtype]] = pa.Field(nullable=True)  # noqa: UP045
    university__Geologico: Optional[Series[pd.BooleanDtype]] = pa.Field(nullable=True)  # noqa: UP045
    university__OGD: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__OGM: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Profondità_falda: Optional[Series[pd.Float64Dtype]] = pa.Field(nullable=True)  # noqa: UP045
    # --- "building" group ---
    building__Istituzione: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Funzionario_competente: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Tipo_di_documento: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Protocollo: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Data_Protocollo: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045


class FeaturedOutputStructureDataSchema(OutputStructuredDataSchema):
    """New structure data schema for the new models, with some fields that are normalized.

    Inherits every field of ``OutputStructuredDataSchema`` and adds five more:
    three ``intervention_start_date_*`` fields and two ``duration_*`` fields.
    """

    # NOTE(review): the two date fields below are annotated with a bare
    # ``datetime.date`` rather than ``Series[...]`` like every other field in
    # this hierarchy — confirm pandera resolves them to date columns as
    # intended.
    intervention_start_date_min: Optional[datetime.date] = pa.Field(nullable=True)  # noqa: UP045
    # No ``pa.Field`` and no ``Optional``: declared neither optional nor nullable.
    intervention_start_date_max: datetime.date
    intervention_start_date_precision: Series[str]
    # NOTE(review): nullable but not ``Optional`` (unlike its siblings), and the
    # ``noqa: UP045`` has no ``Optional`` to apply to on this line — confirm
    # whether the column is meant to be required.
    duration_value: Series[pd.UInt32Dtype] = pa.Field(nullable=True)  # noqa: UP045
    duration_precision: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045


class DatasetAnswerSchema(NamedTuple):
    """Schema of a row in the answer dataframe loadable from the dataset.

    The fields are declared with the same names and in the same order as
    those of ``FeaturedOutputStructureDataSchema`` (inherited fields first,
    then the five normalized ones). ``outputStructuredDataSchema_itertuples``
    casts the rows of such a dataframe to this type.
    """

    id: int
    # --- "university" group ---
    university__Sigla: str | None
    university__Comune: str | None
    university__Ubicazione: str | None
    university__Indirizzo: str | None
    university__Località: str | None
    university__Data_intervento: str | None
    university__Tipo_di_intervento: str | None
    university__Durata: str | None
    university__Eseguito_da: str | None
    university__Direzione_scientifica: str | None
    university__Estensione: str | None
    # NOTE(review): annotated with the pandas dtype class ``pd.UInt32Dtype``
    # rather than a scalar value type — presumably an integer value is
    # meant; confirm before relying on this annotation.
    university__Numero_di_saggi: pd.UInt32Dtype | None
    university__Profondità_massima: float | None
    university__Geologico: bool | None
    university__OGD: str | None
    university__OGM: str | None
    university__Profondità_falda: float | None
    # --- "building" group ---
    building__Istituzione: str | None
    building__Funzionario_competente: str | None
    building__Tipo_di_documento: str | None
    building__Protocollo: str | None
    building__Data_Protocollo: str | None
    # --- normalized fields added by FeaturedOutputStructureDataSchema ---
    intervention_start_date_min: datetime.date | None
    intervention_start_date_max: datetime.date
    intervention_start_date_precision: str
    # NOTE(review): same dtype-class annotation as above; also declared
    # non-optional here although the matching schema field is
    # ``nullable=True`` — confirm which one is intended.
    duration_value: pd.UInt32Dtype
    duration_precision: str | None


def outputStructuredDataSchema_itertuples(
    df: DataFrame[FeaturedOutputStructureDataSchema],
):
    """Iterate over the rows of ``df``, typed as ``DatasetAnswerSchema``.

    This is a typing-only wrapper around ``DataFrame.itertuples``: the rows
    are produced by pandas unchanged and merely ``cast`` for the benefit of
    static type checkers; nothing is validated or converted at runtime.

    NOTE(review): ``itertuples()`` is called with its defaults, which pandas
    documents as also yielding the index as the first element of each row.
    ``DatasetAnswerSchema`` declares no such field, so positional access may
    not line up with it — confirm callers only use attribute access.

    Args:
        df: The dataframe whose rows are iterated.

    Returns:
        The iterator returned by ``df.itertuples()``.
    """
    raw_rows = df.itertuples()
    return cast(Iterator[DatasetAnswerSchema], raw_rows)


# Type alias: a plain dict with string keys and values of any type.
ExtractedStructuredDataSeries = dict[str, Any]