Skip to content

archaeo_super_prompt.dataset.normalization.intervention_date.month_normalization

[docs] module archaeo_super_prompt.dataset.normalization.intervention_date.month_normalization

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Code to transform a noisy month string into a numerical month."""

from thefuzz import process

# Lowercase Italian month names in calendar order: the position of a name in
# this list, plus one, is its month number.
ITALIAN_MONTHS = [
    month_name.lower()
    for month_name in (
        "Gennaio",
        "Febbraio",
        "Marzo",
        "Aprile",
        "Maggio",
        "Giugno",
        "Luglio",
        "Agosto",
        "Settembre",
        "Ottobre",
        "Novembre",
        "Dicembre",
    )
]


def to_int_month(month_str: str) -> int:
    """Convert a noisy month string into its month number.

    The input is stripped and lowercased, then resolved in this order:
    a decimal number, an exact Italian month name, and finally the closest
    Italian month name found by fuzzy matching with a score cutoff of 90.

    Args:
        month_str: The raw month, either numeric (e.g. ``"3"``) or an Italian
            month name, possibly misspelled or padded with whitespace.

    Returns:
        The month number, from 1 (gennaio) to 12 (dicembre).

    Raises:
        ValueError: If the string is empty, is a number outside 1-12, or
            does not match any Italian month name closely enough.
    """
    # Normalize before any test: a padded number such as " 3 " is not
    # "digit-only" until it has been stripped.
    norm = month_str.strip().lower()
    if not norm:
        raise ValueError(f"Cannot parse this month: '{month_str}'")

    # isdecimal() only accepts characters that int() can parse, whereas
    # isdigit() also accepts e.g. superscript digits that make int() fail.
    if norm.isdecimal():
        month = int(norm)
        if not 1 <= month <= 12:
            raise ValueError(
                f"Month number out of range (1-12): '{month_str}'"
            )
        return month

    if norm in ITALIAN_MONTHS:
        return ITALIAN_MONTHS.index(norm) + 1

    # Fall back to fuzzy matching to absorb typos; only the single best
    # candidate is requested, and it is dropped if it scores below the cutoff.
    best_month_list: list[tuple[str, int]] = process.extractBests(
        norm, ITALIAN_MONTHS, limit=1, score_cutoff=90
    )
    if not best_month_list:
        raise ValueError(f"Cannot parse this month: '{month_str}'")
    return ITALIAN_MONTHS.index(best_month_list[0][0]) + 1