1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""Types related to the wanted structured data in the dataset."""
from collections.abc import Iterator
import datetime
from typing import Any, NamedTuple, Optional, cast
import pandera.pandas as pa
from pandera.typing.pandas import DataFrame, Series
import pandas as pd
def _negativeFloatColumn():
    """Build a nullable ``Float64`` column restricted to values below 1e-6.

    A new ``pa.Column`` is created on every call, so each schema entry that
    uses this helper gets its own column object.

    Returns:
        A ``pa.Column`` with dtype ``pd.Float64Dtype``, ``nullable=True`` and
        a single ``lt(0.000001)`` check.
    """
    # Strict upper bound: zero and negative values pass the check.
    # NOTE(review): the tiny positive bound looks like a tolerance around
    # zero -- confirm.
    below_threshold = pa.Check.lt(0.000001)
    return pa.Column(
        dtype=pd.Float64Dtype,
        checks=below_threshold,
        nullable=True,
    )
# Schema of the raw structured-data frame. Column names are dotted
# ("<prefix>.<label>") with the prefixes "scheda_intervento", "university",
# "check" and "building".
structuredDataSchema = pa.DataFrameSchema(
    {
        # Identifier column: plain int, no nullable flag passed.
        "scheda_intervento.id": pa.Column(dtype=int),
        # "university.*" columns: nullable strings unless noted otherwise.
        "university.Sigla": pa.Column(dtype=str, nullable=True),
        "university.Comune": pa.Column(dtype=str, nullable=True),
        "university.Ubicazione": pa.Column(dtype=str, nullable=True),
        "university.Indirizzo": pa.Column(dtype=str, nullable=True),
        "university.Località": pa.Column(dtype=str, nullable=True),
        "university.Data intervento": pa.Column(dtype=str, nullable=True),
        "university.Tipo di intervento": pa.Column(dtype=str, nullable=True),
        "university.Durata": pa.Column(dtype=str, nullable=True),
        "university.Eseguito da": pa.Column(dtype=str, nullable=True),
        "university.Direzione scientifica": pa.Column(dtype=str, nullable=True),
        "university.Estensione": pa.Column(dtype=str, nullable=True),
        # Nullable unsigned integer, checked to be >= 0.
        "university.Numero di saggi": pa.Column(
            dtype="UInt32",
            checks=pa.Check.ge(0),
            nullable=True,
        ),
        "university.Profondità massima": _negativeFloatColumn(),
        # Nullable boolean (pandas "boolean" extension dtype string).
        "university.Geologico": pa.Column(dtype="boolean", nullable=True),
        "university.OGD": pa.Column(dtype=str, nullable=True),
        "university.OGM": pa.Column(dtype=str, nullable=True),
        "university.Profondità falda": _negativeFloatColumn(),
        # "check.*" columns: plain bool, no nullable flag passed.
        "check.Preistoria": pa.Column(dtype=bool),
        "check.Età Protostorica": pa.Column(dtype=bool),
        "check.Età Etrusca": pa.Column(dtype=bool),
        "check.Età Romana": pa.Column(dtype=bool),
        "check.Età Tardoantica": pa.Column(dtype=bool),
        "check.Alto Medioevo": pa.Column(dtype=bool),
        "check.Basso Medioevo": pa.Column(dtype=bool),
        "check.Età Moderna": pa.Column(dtype=bool),
        "check.Età Contemporanea": pa.Column(dtype=bool),
        "check.Non identificati": pa.Column(dtype=bool),
        # "building.*" columns: nullable strings.
        "building.Istituzione": pa.Column(dtype=str, nullable=True),
        "building.Funzionario competente": pa.Column(dtype=str, nullable=True),
        "building.Tipo di documento": pa.Column(dtype=str, nullable=True),
        "building.Protocollo": pa.Column(dtype=str, nullable=True),
        "building.Data Protocollo": pa.Column(dtype=str, nullable=True),
    }
)
class OutputStructuredDataSchema(pa.DataFrameModel):
    """Schema of the intervention target metadata in the dataset.

    Field names follow the dotted column names of ``structuredDataSchema``
    with the ``university``/``building`` prefix joined by ``__`` and spaces
    replaced by ``_`` (e.g. ``"university.Data intervento"`` becomes
    ``university__Data_intervento``). The ``check.*`` columns of
    ``structuredDataSchema`` have no counterpart here, and
    ``scheda_intervento.id`` appears as plain ``id``. The renaming itself is
    not performed in this part of the file.

    Every field except ``id`` is annotated ``Optional[Series[...]]`` and
    declared with ``pa.Field(nullable=True)``.
    """

    # The only field that is neither Optional nor nullable.
    id: Series[int]

    # The ``# noqa: UP045`` markers silence the linter rule that rewrites
    # ``Optional[X]`` as ``X | None``.
    # NOTE(review): presumably pandera needs the ``typing.Optional`` form to
    # treat a column as not required -- confirm before modernising.

    # --- "university" fields -------------------------------------------
    university__Sigla: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Comune: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Ubicazione: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Indirizzo: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Località: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Data_intervento: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Tipo_di_intervento: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Durata: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Eseguito_da: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Direzione_scientifica: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Estensione: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    # Typed fields use pandas nullable extension dtypes rather than str.
    university__Numero_di_saggi: Optional[Series[pd.UInt32Dtype]] = pa.Field(nullable=True)  # noqa: UP045
    university__Profondità_massima: Optional[Series[pd.Float64Dtype]] = pa.Field(nullable=True)  # noqa: UP045
    university__Geologico: Optional[Series[pd.BooleanDtype]] = pa.Field(nullable=True)  # noqa: UP045
    university__OGD: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__OGM: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    university__Profondità_falda: Optional[Series[pd.Float64Dtype]] = pa.Field(nullable=True)  # noqa: UP045

    # --- "building" fields ---------------------------------------------
    building__Istituzione: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Funzionario_competente: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Tipo_di_documento: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Protocollo: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
    building__Data_Protocollo: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
class FeaturedOutputStructureDataSchema(OutputStructuredDataSchema):
    """New structure data schema for the new models, with some fields that are normalized.

    Extends ``OutputStructuredDataSchema`` with five additional fields:
    three ``intervention_start_date_*`` fields and two ``duration_*`` fields.
    """

    # NOTE(review): the two date bounds are annotated with a bare
    # ``datetime.date`` (not ``Series[...]`` like every other field) --
    # confirm this is the intended way to declare a date column.
    intervention_start_date_min: Optional[datetime.date] = pa.Field(nullable=True)  # noqa: UP045
    intervention_start_date_max: datetime.date
    intervention_start_date_precision: Series[str]
    # Nullable but not ``Optional``; the ``noqa`` marker is kept even though
    # no ``Optional`` appears on this line.
    duration_value: Series[pd.UInt32Dtype] = pa.Field(nullable=True)  # noqa: UP045
    duration_precision: Optional[Series[str]] = pa.Field(nullable=True)  # noqa: UP045
class DatasetAnswerSchema(NamedTuple):
    """Schema of a row in the answer dataframe loadable from the dataset.

    The fields mirror, by name and in the same order, those declared on
    ``FeaturedOutputStructureDataSchema`` (the inherited
    ``OutputStructuredDataSchema`` fields first, then the five added ones).
    In this part of the file the class is only used as the target type of
    the ``cast`` in ``outputStructuredDataSchema_itertuples``.
    """

    id: int

    # --- "university" fields -------------------------------------------
    university__Sigla: str | None
    university__Comune: str | None
    university__Ubicazione: str | None
    university__Indirizzo: str | None
    university__Località: str | None
    university__Data_intervento: str | None
    university__Tipo_di_intervento: str | None
    university__Durata: str | None
    university__Eseguito_da: str | None
    university__Direzione_scientifica: str | None
    university__Estensione: str | None
    # NOTE(review): annotated with the dtype class ``pd.UInt32Dtype``, not a
    # scalar type such as ``int`` -- confirm what row values look like.
    university__Numero_di_saggi: pd.UInt32Dtype | None
    university__Profondità_massima: float | None
    university__Geologico: bool | None
    university__OGD: str | None
    university__OGM: str | None
    university__Profondità_falda: float | None

    # --- "building" fields ---------------------------------------------
    building__Istituzione: str | None
    building__Funzionario_competente: str | None
    building__Tipo_di_documento: str | None
    building__Protocollo: str | None
    building__Data_Protocollo: str | None

    # --- fields added by FeaturedOutputStructureDataSchema ---------------
    intervention_start_date_min: datetime.date | None
    intervention_start_date_max: datetime.date
    intervention_start_date_precision: str
    # NOTE(review): not marked ``| None`` here, although the matching schema
    # field is declared with ``nullable=True`` -- confirm which is intended.
    duration_value: pd.UInt32Dtype
    duration_precision: str | None
def outputStructuredDataSchema_itertuples(
    df: DataFrame[FeaturedOutputStructureDataSchema],
):
    """Iterate over the rows of ``df`` typed as ``DatasetAnswerSchema``.

    Args:
        df: Data frame annotated with ``FeaturedOutputStructureDataSchema``.

    Returns:
        The result of ``df.itertuples()``, presented to the type checker as
        ``Iterator[DatasetAnswerSchema]``.
    """
    rows = df.itertuples()
    # ``cast`` only informs the type checker; the object returned is exactly
    # what ``itertuples`` produced.
    return cast(Iterator[DatasetAnswerSchema], rows)
# Type alias: a plain dict with string keys and values of any type.
# NOTE(review): presumably one extracted record keyed by field name --
# confirm against the code that builds it.
ExtractedStructuredDataSeries = dict[str, Any]
|