Skip to content

archaeo_super_prompt.modeling.chunk_selector

[docs] module archaeo_super_prompt.modeling.chunk_selector

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""Utils to select boundaries page of interest in documents."""

import math
import functools as fnt
from typing import cast

from ..types.pdfchunks import PDFChunkPerInterventionDataset


def _get_reasonable_page_number(chunkDataset: PDFChunkPerInterventionDataset):
    MAX_SELECTABLE_PAGE_NUMBER = 3
    total_page_number = max(
        fnt.reduce(
            lambda flat, lst: [*flat, *lst],
            cast(
                list[list[int]],
                chunkDataset.data["chunk_page_position"].to_list(),
            ),
            cast(list[int], []),
        )
    )
    max_selected_page_number = min(
        MAX_SELECTABLE_PAGE_NUMBER, math.ceil(0.1 * total_page_number)
    )
    return max_selected_page_number, total_page_number


def select_incipit(chunkDataset: PDFChunkPerInterventionDataset):
    """Keep only the chunks that begin within the first pages.

    A chunk is retained when the smallest of its page positions is strictly
    below the page budget returned by ``_get_reasonable_page_number``.
    """
    page_budget = _get_reasonable_page_number(chunkDataset)[0]
    first_page_of_chunk = chunkDataset.data["chunk_page_position"].apply(min)
    starts_in_incipit = first_page_of_chunk < page_budget
    return PDFChunkPerInterventionDataset(chunkDataset.data[starts_in_incipit])


def select_end_pages(chunkDataset: PDFChunkPerInterventionDataset):
    """Keep only the chunks that reach into the last pages.

    A chunk is retained when the largest of its page positions is strictly
    above ``total_page_number - max_selected_page_number``, both values
    coming from ``_get_reasonable_page_number``.
    """
    page_budget, last_page = _get_reasonable_page_number(chunkDataset)
    threshold = last_page - page_budget
    last_page_of_chunk = chunkDataset.data["chunk_page_position"].apply(max)
    ends_in_tail = last_page_of_chunk > threshold
    return PDFChunkPerInterventionDataset(chunkDataset.data[ends_in_tail])