archaeo_super_prompt.dataset.thesauri.comune_province
[docs]
module
archaeo_super_prompt.dataset.thesauri.comune_province
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Loading of thesauri related to the comune and the province."""
from typing import NamedTuple
import pandas as pd
from pandera.pandas import DataFrameModel
from pandera.typing.pandas import DataFrame, Index, Series
from ...utils.cache import get_cache_dir_for
def _get_comune_file():
    """Return the location of ``comune.csv`` in the raw thesaurus cache.

    The directory comes from ``get_cache_dir_for("raw", "thesaurus")``; the
    result is whatever that object yields when joined with ``/``
    (presumably a ``pathlib.Path`` -- confirm against the cache helper).
    """
    thesaurus_dir = get_cache_dir_for("raw", "thesaurus")
    return thesaurus_dir / "comune.csv"
def _get_provincie_file():
    """Return the location of ``provincie.csv`` in the raw thesaurus cache.

    The directory comes from ``get_cache_dir_for("raw", "thesaurus")``; the
    result is whatever that object yields when joined with ``/``
    (presumably a ``pathlib.Path`` -- confirm against the cache helper).
    """
    thesaurus_dir = get_cache_dir_for("raw", "thesaurus")
    return thesaurus_dir / "provincie.csv"
def load_comune() -> list[tuple[int, str]]:
    """Load the thesaurus values for the "Comune" field.

    Reads the comune CSV and drops every row whose ``nome`` is missing.

    Returns:
        One ``(id_com, nome)`` pair per remaining row, in file order.
    """
    df = pd.read_csv(_get_comune_file())
    # Select rows and columns in a single .loc call rather than chaining two
    # indexing operations.
    named = df.loc[df["nome"].notnull(), ["id_com", "nome"]]
    # index=False / name=None makes itertuples yield plain (id_com, nome)
    # tuples, so there is no index to unpack and throw away.
    return list(named.itertuples(index=False, name=None))
class Provincia(NamedTuple):
    """Immutable record describing one province.

    Attributes:
        id_: integer identifier of the province.
        name: name of the province.
        sigla: short code (sigla) of the province.
    """

    id_: int
    name: str
    sigla: str
class ComuneProvincia(NamedTuple):
    """Immutable record pairing a comune with the province it belongs to.

    Attributes:
        comune: the comune, stored as a string.
        provincia: the ``Provincia`` record attached to this comune.
    """

    # Original note: "the name and the id".
    # NOTE(review): the annotation is a plain ``str`` although the note
    # mentions both a name and an id -- confirm how the two are encoded.
    comune: str
    provincia: Provincia
class ComuneData(DataFrameModel):
    """Pandera schema for the comune table.

    The frame is indexed by ``comune_id`` (int) and carries two columns:
    ``name`` (str) and ``province_id`` (int).
    """

    # Index of the frame: one integer id per comune.
    comune_id: Index[int]
    # Name of the comune.
    name: Series[str]
    # Integer id of the province attached to the comune.
    province_id: Series[int]
class ProvinciaData(DataFrameModel):
    """Pandera schema for the province table.

    The frame is indexed by ``province_id`` (int) and carries two string
    columns: ``name`` and ``sigla``.
    """

    # Index of the frame: one integer id per province.
    province_id: Index[int]
    # Name of the province.
    name: Series[str]
    # Two-character code of the province (length is not enforced here).
    sigla: Series[str]
def load_comune_with_provincie() -> tuple[
    DataFrame[ComuneData], DataFrame[ProvinciaData]
]:
    """Load the comune and province thesauri as two validated tables.

    Both CSV files are read from the raw thesaurus cache, their columns are
    renamed to the schema names and each table is indexed by its id.

    Returns:
        A pair ``(comuni, province)``:

        * ``comuni`` -- rows of the comune file that have both a ``nome``
          and a ``provincia``, validated against ``ComuneData``.
        * ``province`` -- every row of the province file, validated against
          ``ProvinciaData``.
    """
    raw_comuni = pd.read_csv(_get_comune_file())
    # keep_default_na=False stops pandas from turning strings such as "NA"
    # into NaN -- presumably needed because "NA" is a legitimate sigla
    # (TODO confirm against the data file).
    raw_province = pd.read_csv(_get_provincie_file(), keep_default_na=False)

    # Keep only the comuni that have both a name and a province.
    # NOTE(review): if the raw "provincia" column contains missing values,
    # pandas reads it as float; confirm the ``int`` schema check still passes
    # after the filtering.
    has_name = raw_comuni["nome"].notnull()
    has_province = raw_comuni["provincia"].notnull()
    comuni = raw_comuni.loc[
        has_name & has_province, ["id_com", "nome", "provincia"]
    ]
    comuni = comuni.rename(
        columns={
            "id_com": "comune_id",
            "nome": "name",
            "provincia": "province_id",
        }
    )
    validated_comuni = ComuneData.validate(comuni.set_index("comune_id"))

    province = raw_province[["id_prov", "nome", "sigla"]]
    province = province.rename(
        columns={"id_prov": "province_id", "nome": "name"}
    )
    validated_province = ProvinciaData.validate(
        province.set_index("province_id")
    )

    return validated_comuni, validated_province
|