
archaeo_super_prompt.modeling.struct_extract.extractors.archiving_date

"""Mock archiving extractor with the good data.

As a model for extracting this data is not done, for now
we assume this data as known and simulate this behaviour
with loading from the dataset.
"""

import datetime
from typing import Any, cast, override

import pandas as pd
from pandera.typing.pandas import DataFrame, Series

from .....dataset.load import MagohDataset
from .....types.intervention_id import InterventionId
from .....types.pdfchunks import PDFChunkDataset
from .....types.per_intervention_feature import (
    BasePerInterventionFeatureSchema,
)
from ....types.detailed_evaluator import DetailedEvaluatorMixin


class ArchivingDateOutputSchema(BasePerInterventionFeatureSchema):
    """when indentifying the date of an intervention, we refer first to the date of protocol."""

    data_protocollo: datetime.date


class ArchivingDateProvider(DetailedEvaluatorMixin[Any, MagohDataset, Any]):
    """Give the answer of the ArchivingDate."""

    def __init__(self) -> None:
        """."""
        super().__init__()
        self._mds: MagohDataset | None = None

    @override
    def fit(self, X, y: MagohDataset, **kwargs):
        """Store the reference dataset from which the archiving dates are read."""
        X = X  # unused
        kwargs = kwargs  # unused
        self._mds = y
        return self

    def filter_ids(self, y: MagohDataset, ids: set[InterventionId]):
        """Only keeps the records with an inserted archiving date."""
        return y.filter_good_records_for_training(
            ids,
            lambda df: cast(
                Series[bool], df["building__Data_Protocollo"].notnull()
            ),
        )

    @override
    def predict(
        self,
        X: PDFChunkDataset,
    ) -> DataFrame[ArchivingDateOutputSchema]:
        """Return the ground-truth archiving date for every intervention in X."""
        if self._mds is None:
            raise NotImplementedError(
                "Cannot infer the archiving date. Please fit the model with "
                "the dataset so that the answers can be output."
            )

        def to_date(dp: str):
            # Parse a dash-separated day-month-year string into a date
            d, m, y = [int(k) for k in dp.strip().split("-")]
            return datetime.date(y, m, d)

        return ArchivingDateOutputSchema.validate(
            pd.DataFrame(
                [
                    {
                        "id": a.id,
                        "data_protocollo": to_date(
                            str(a.building__Data_Protocollo)
                        ),
                    }
                    for a in self._mds.get_answers(set(X["id"].to_list()))
                ]
            ).set_index("id")
        )

    @override
    def score(self, X, y, sample_weight=None) -> float:
        """Always return a perfect score; the answers come from the ground truth."""
        X = X  # unused
        y = y  # unused
        sample_weight = sample_weight  # unused
        return 1.0

    @override
    def score_and_transform(self, X, y):
        """Return both the score and the predicted archiving dates."""
        return self.score(X, y), self.predict(X)
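
A minimal usage sketch (not part of the module), assuming a MagohDataset instance named dataset and a PDFChunkDataset named chunks have already been built elsewhere:

# Hypothetical usage; `dataset` (MagohDataset) and `chunks` (PDFChunkDataset)
# are assumed to be loaded by the surrounding pipeline.
from archaeo_super_prompt.modeling.struct_extract.extractors.archiving_date import (
    ArchivingDateProvider,
)

provider = ArchivingDateProvider().fit(None, dataset)  # X is ignored by fit
dates = provider.predict(chunks)  # DataFrame indexed by intervention id
print(dates["data_protocollo"].head())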