archaeo_super_prompt.dataset.minio_engine
[docs]
module
archaeo_super_prompt.dataset.minio_engine
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from typing import cast
from minio import Minio
from pathlib import Path
import re
from ..utils.cache import get_cache_dir_for
from ..config.env import getenv_or_throw
# Connection settings for the MinIO server, read once when the module is
# imported. _host is used as the endpoint, _user/_password as the access and
# secret keys when _init_client builds the client.
# NOTE(review): getenv_or_throw presumably raises when a variable is unset,
# which would make importing this module fail -- confirm in config.env.
_host = getenv_or_throw("MINIO_HOST")
_user = getenv_or_throw("MINIO_ROOT_USER")
_password = getenv_or_throw("MINIO_ROOT_PASSWORD")
# Shared Minio client; stays None until download_files first needs it.
__client = None
# Bucket the files are listed and downloaded from; _init_client creates it
# when it does not exist yet.
BUCKET_NAME = "training-reports"
def _init_client():
    """Build a Minio client for the configured host and ensure the bucket exists.

    The connection uses the module-level host and credentials. TLS is
    disabled only when the host string starts with ``"localhost"``; every
    other host is contacted over a secure connection.

    Returns:
        The connected ``Minio`` client, with ``BUCKET_NAME`` guaranteed to
        have been created if it was missing.
    """
    use_tls = not _host.startswith("localhost")
    client = Minio(
        _host,
        access_key=_user,
        secret_key=_password,
        secure=use_tls,
    )
    bucket_missing = not client.bucket_exists(BUCKET_NAME)
    if bucket_missing:
        client.make_bucket(BUCKET_NAME)
    return client
# Only letters, digits, underscores, hyphens and dots are considered safe.
# The character class is negated, so this pattern matches every run of
# UNSAFE characters.
SAFE_FILENAME_PATTERN = re.compile(r"[^a-zA-Z0-9_.-]+")


def sanitize_filename(filename):
    """Return *filename* with each run of unsafe characters collapsed to "_".

    A character is unsafe when it is not an ASCII letter, a digit, an
    underscore, a hyphen or a dot. Consecutive unsafe characters are
    replaced by a single underscore.
    """
    cleaned = re.sub(SAFE_FILENAME_PATTERN, "_", filename)
    return cleaned
def download_files(intervention_id: int) -> list[Path]:
    """Return local paths of the files stored for one intervention.

    Files are cached under ``<cache dir>/<intervention_id>/``. When that
    directory already contains entries they are returned as-is and the
    object store is not contacted. Otherwise every object stored under the
    ``"<intervention_id>/"`` prefix of ``BUCKET_NAME`` is downloaded into
    that directory.

    Args:
        intervention_id: Identifier whose decimal form is both the key
            prefix in the bucket and the name of the local sub-directory.

    Returns:
        The cached paths, or the paths of the freshly downloaded files
        (an empty list when the bucket holds nothing for this id).
    """
    global __client

    pdf_store_dir = get_cache_dir_for("external", "pdfs")
    pdf_store_dir.mkdir(parents=True, exist_ok=True)

    output_pathdir = pdf_store_dir / str(intervention_id)
    if output_pathdir.exists():
        cached = list(output_pathdir.iterdir())
        if cached:
            return cached

    # The client is created lazily so that a fully cached run never has to
    # reach the object store.
    if __client is None:
        __client = _init_client()
    client = __client

    # The trailing slash matters: a bare "12" prefix would also match the
    # objects of interventions 120, 1234, ...
    prefix = f"{intervention_id}/"

    downloaded: list[Path] = []
    for remote in client.list_objects(
        BUCKET_NAME, prefix=prefix, recursive=True
    ):
        object_name = remote.object_name
        if object_name is None:
            continue
        relative_name = object_name[len(prefix):]
        # Skip the prefix placeholder itself and "directory" marker keys.
        if not relative_name or relative_name.endswith("/"):
            continue
        # Nested keys are flattened into a single file name (the "/" is
        # replaced by the sanitizer), so the non-recursive cache listing
        # above finds every downloaded file.
        output_path = output_pathdir / sanitize_filename(relative_name)
        client.fget_object(BUCKET_NAME, object_name, str(output_path))
        downloaded.append(output_path)
    return downloaded
|