Skip to content

archaeo_super_prompt.modeling.pdf_to_text.document_division

[docs] module archaeo_super_prompt.modeling.pdf_to_text.document_division

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""Utility functions to divide the pages of a PDF document into slices."""

from docling.datamodel.settings import PageRange


def get_page_ranges(
    doc_page_number: int,
    page_batch_size: int,
    border_page_nb: int | None = None,
) -> list[PageRange]:
    """Divide a number of pages into batch intervals.

    If only the header and the footer of the document are wanted, then
    only divide the first pages and the last pages into batch intervals. Set
    the argument border_page_nb to trigger such a behaviour.

    The number of page in a batch is set according to the number of page
    the remote LLM is able to process in parallel.

    Arguments:
        doc_page_number: the total number of pages in the document
        page_batch_size: the number of pages in a slice
        border_page_nb: if given, only keep this number of page from the start and from the end (so 2*border_page_nb) will be processed with the output ranges
    """

    def split_into_batch_page_range(start_page: int, end_page: int):
        return [
            (i, min(i + page_batch_size - 1, end_page))
            for i in range(
                start_page,
                min(end_page, doc_page_number) + 1,
                page_batch_size,
            )
        ]

    def get_start_and_end_pages(border_page_nb: int):
        if doc_page_number < 2 * border_page_nb:
            return split_into_batch_page_range(1, doc_page_number)
        return [
            *split_into_batch_page_range(1, border_page_nb),
            *split_into_batch_page_range(
                doc_page_number - border_page_nb + 1, doc_page_number
            ),
        ]

    if border_page_nb is not None:
        return get_start_and_end_pages(border_page_nb)
    return split_into_batch_page_range(1, doc_page_number)