archaeo_super_prompt.dataset.postgresql_engine


"""Structured data loading from a remote dataset.

This module manages the interaction with the postgresql database to load a
pandas DataFrame. The sqlalchemy library with the psycopg2 engine are used.
"""

from pathlib import Path

import pandas as pd
from sqlalchemy import Engine, create_engine

from ..config.env import getenv_or_throw


def _create_engine_from_credentials():
    """Build a SQLAlchemy engine from connection settings in the environment."""
    DIALECT = "postgresql"
    DRIVER = "psycopg2"
    writing_db_user = getenv_or_throw("PG_SUPERUSER")
    db_name = getenv_or_throw("PG_DB_NAME")
    db_user_password = getenv_or_throw("PG_DB_PASSWORD")

    db_host = getenv_or_throw("PG_DB_HOST")
    db_port = getenv_or_throw("PG_DB_PORT")

    # Connection URL format: postgresql+psycopg2://user:password@host:port/dbname
    return create_engine(
        f"{DIALECT}+{DRIVER}://{writing_db_user}:{db_user_password}@{db_host}:{db_port}/{db_name}"
    )
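
# Expected environment configuration (the variable names come from the calls
# above; the values below are purely illustrative):
#
#   PG_SUPERUSER=postgres
#   PG_DB_NAME=archaeo
#   PG_DB_PASSWORD=secret
#   PG_DB_HOST=localhost
#   PG_DB_PORT=5432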


__engine: Engine | None = None


def _get_engine():
    """Return the module-wide engine, creating it lazily on first use."""
    global __engine
    if __engine is None:
        __engine = _create_engine_from_credentials()
    return __engine


def _import_sql(sql_path: Path):
    """Read and return the content of a SQL file as a string."""
    with sql_path.open("r") as sql_file:
        return sql_file.read()


__module_dir = Path(__file__).parent

__seed_setting_request = _import_sql(__module_dir / Path("sql/setseed.sql"))
__sampling_request = _import_sql(__module_dir / Path("sql/sampling.sql"))
__sampling_on_recents_request = _import_sql(
    __module_dir / Path("sql/sampling_on_recents.sql")
)
__get_sample_findings_request = _import_sql(
    __module_dir / Path("sql/sample_findings.sql")
)
__get_intervention_with_ids = _import_sql(
    __module_dir / Path("sql/select_ids.sql")
)
__get_findings_with_ids = _import_sql(
    __module_dir / Path("sql/select_findings_ids.sql")
)
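
# Note: sql/sample_findings.sql is expected to contain the marker line
# "-- sampling-placeholder", which get_entries() below replaces with one of
# the two sampling queries loaded above.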


def get_entries(max_number: int, seed: float, only_recent_entries=False):
    """Fetch a deterministic random sample of interventions from the remote database."""
    engine = _get_engine()
    # Pick the sampling strategy once so that both the intervention query and
    # the findings query honour the only_recent_entries flag.
    sampling_request = (
        __sampling_on_recents_request
        if only_recent_entries
        else __sampling_request
    )
    findings_request = __get_sample_findings_request.replace(
        "-- sampling-placeholder", sampling_request
    )
    deterministic_params = {"seed": seed, "max_number": max_number}
    print("Fetching structured intervention data...")
    intervention_data = pd.read_sql(
        __seed_setting_request + "\n" + sampling_request,
        engine,
        params=deterministic_params,
    )
    print("Fetching done!")
    print("Fetching saved findings for each intervention...")
    findings = pd.read_sql(
        __seed_setting_request + "\n" + findings_request,
        engine,
        params=deterministic_params,
    )
    print("Fetching done!")
    return intervention_data, findings


def get_entries_with_ids(ids: set[int]):
    """Fetch from the database the metadata of the interventions with the given ids."""
    engine = _get_engine()
    # psycopg2 expands a tuple parameter into a SQL "IN" list.
    id_set_for_request = tuple(ids)
    interventions = pd.read_sql(
        __get_intervention_with_ids,
        engine,
        params={"intervention_ids": id_set_for_request},
    )
    findings = pd.read_sql(
        __get_findings_with_ids,
        engine,
        params={"intervention_ids": id_set_for_request},
    )
    return interventions, findings
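
# Minimal usage sketch (assuming the PG_* environment variables are set and
# the database is reachable; the sample size, seed, and ids are arbitrary):
#
#   interventions, findings = get_entries(max_number=100, seed=0.42)
#   some_interventions, their_findings = get_entries_with_ids({1, 2, 3})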