# Module: archaeo_super_prompt.modeling.struct_extract.chunks_to_text
# (header reconstructed from a documentation-page scrape; the original
# capture included site navigation text and a line-number gutter here.)
"""Management of the prompt attachment creation."""

from typing import cast
from pandera.typing.pandas import DataFrame
import pandas as pd
from sklearn.pipeline import FunctionTransformer
from ..entity_extractor.types import ChunksWithThesaurus
from .types import InputForExtractionWithSuggestedThesauri
from tqdm import tqdm


def ChunksToText():
    """Unifies the filtered chunks into one attachment text for an LLM prompt.

    This pipeline Transformer applies this chunk merge for each intervention.

    Returns:
        A scikit-learn ``FunctionTransformer`` that maps a
        ``DataFrame[ChunksWithThesaurus]`` (one row per chunk) to a
        ``DataFrame[InputForExtractionWithSuggestedThesauri]`` (one row per
        intervention ``id``, indexed by ``id``).
    """
    # TODO: define a unique ChunksWithSuggestedValues, regardless if its a
    # thesaurus identifier, an identified number, etc.

    def to_readable_context_string(
        filtered_chunks: DataFrame[ChunksWithThesaurus],
    ) -> str:
        """Merge one intervention's chunks into a single annotated text.

        Chunks are ordered by ``chunk_index``; each is preceded by a header
        line naming its source file, page and type, and followed by a
        horizontal-rule separator.
        """
        parts: list[str] = []
        # Sort once so the attachment follows the original document order.
        for _, chunk in filtered_chunks.sort_values(
            by="chunk_index"
        ).iterrows():
            parts.append(
                f"`%% {chunk['filename']} | Page {chunk['chunk_page_position']} ({chunk['chunk_type']}) %%`\n\n"
            )
            parts.append(chunk["chunk_content"] + "\n" * 2)
            parts.append("`" + "-" * 60 + "`\n\n")
        # str.join avoids the quadratic cost of repeated `msg +=`.
        return "".join(parts)

    def unify_thesaurus(X: DataFrame[ChunksWithThesaurus]):
        """Union the per-chunk thesaurus collections into one set."""
        return set().union(*X["identified_thesaurus"].tolist())

    def _intervention_row(
        id_, filtered_chunks: DataFrame[ChunksWithThesaurus]
    ) -> dict:
        """Build one output row (merged text + unified thesauri) for ``id_``."""
        return {
            "id": id_,
            "merged_chunks": to_readable_context_string(filtered_chunks),
            "identified_thesaurus": list(unify_thesaurus(filtered_chunks)),
        }

    def ChunksToPromptContent(
        X: DataFrame[ChunksWithThesaurus],
    ) -> DataFrame[InputForExtractionWithSuggestedThesauri]:
        """Group chunks by intervention ``id`` and emit one row per group."""
        return InputForExtractionWithSuggestedThesauri.validate(
            pd.DataFrame(
                _intervention_row(
                    id_, cast(DataFrame[ChunksWithThesaurus], filtered_chunks)
                )
                for id_, filtered_chunks in tqdm(
                    X.groupby("id"),
                    "Gathering filtered chunks",
                    unit="intervention",
                )
            ).set_index("id")
        )

    return FunctionTransformer(ChunksToPromptContent)