archaeo_super_prompt.utils.cache

[docs] module archaeo_super_prompt.utils.cache
"""Cache feature management."""from pathlib importPathfrom typing import(Any,Literal,TypeVar,cast,)from collections.abc importCallable,Iteratorfrom joblib importMemoryfrom joblib.memory importMemorizedFunc_CACHE_DIR=(Path(__file__).parent/"../../../data/").resolve()CacheSubpart=Literal["external","interim","processed","raw"]_memories:dict[CacheSubpart,Memory]={k:Memory(str(_CACHE_DIR/k),verbose=0)forkincast(tuple[CacheSubpart,...],("external","interim","processed"))}def get_cache_dir_for(cache_subpart:CacheSubpart,subpart:str):[docs]
    """Return a path object pointing to a subdir of the given "/data" directory."""subdir=_CACHE_DIR.joinpath(cache_subpart,subpart)ifnotsubdir.exists():subdir.mkdir(parents=True)returnsubdirdef get_memory_for(cache_subpart:CacheSubpart):[docs]
    """Get the joblib cache memory related to a subpath of the "/data" directory."""return_memories[cache_subpart]## Manual cachingdef identity_function(input:Any,output_to_be_cached:Any|None):[docs]
    """Identity function."""input=inputreturnoutput_to_be_cacheddef is_input_in_the_cache(identity_function:MemorizedFunc,input:Any):[docs]
    """Return if the input has already an output saved in the cache."""ifnotidentity_function.check_call_in_cache(input):returnFalsereturnidentity_function(input)isnotNonedef manually_cache_result([docs]
identity_function:MemorizedFunc,input:Any,output:Any):    """Manually save the input and its output in the joblib's cache.    Arguments:        identity_function: a dummy cached function to carry out the joblib \cache mechanism, built from a wrapping of the identity_function function given \by the module. The funtion must ignore the output argument in the caching        input: a hashable input        output: the value to be saved in the cache    """identity_function.call(input,output)HashedT=TypeVar("HashedT")Output=TypeVar("Output")CacheIngestorFunction=Callable[[HashedT,Output|None],Output|None]"""The name of the arguments is important: the output argument must be exactlynamed 'output'"""def escape_expensive_run_when_cached[Input,HashedT,Output]([docs]
named_id_func:CacheIngestorFunction[HashedT,Output],memory:Memory,input_hash_function:Callable[[Input],HashedT],expensive_function:Callable[[Iterator[Input]],Iterator[Output]],input_iter:Iterator[Input],):    """TODO: comment.    Arguments:        named_id_func: a function defined like this (input, output) -> output \output can be None        memory: TODO        input_hash_function: TODO        expensive_function: TODO        input_iter: TODO    """identity_function=cast(MemorizedFunc,memory.cache(named_id_func,ignore=["output"]))cached_fn=cast(CacheIngestorFunction[HashedT,Output],identity_function)results:list[tuple[Input,Output|None]]=[]inputs_to_be_processed:list[Input]=[]forinptininput_iter:hashed_inpt=input_hash_function(inpt)result=(cached_fn(hashed_inpt,None)ifis_input_in_the_cache(identity_function,hashed_inpt)elseNone)results.append((inpt,result))ifresultisNone:inputs_to_be_processed.append(inpt)new_results=expensive_function(iter(inputs_to_be_processed))forinpt,resultinresults:ifresultisNone:hashed_inpt=input_hash_function(inpt)try:new_result=next(new_results)# just pass to this identity function to save it in the cachemanually_cache_result(identity_function,hashed_inpt,new_result)yieldinpt,new_resultcontinueexceptStopIteration:raiseException(f"The function {named_id_func.__name__} has missed some results to be produced")yieldinpt,resultdef manualy_cache_batch_processing[Input,Output]([docs]
path_for_input:Callable[[Input],Path],cache_on_disk:Callable[[Output,Path],None],load_output_from_cache:Callable[[Path],Output],expensive_function:Callable[[Iterator[Input]],Iterator[Output]],input_iter:Iterator[Input],)->Iterator[tuple[Input,Output]]:    """Lazily execute an expensive function taking a batch of inputs with cache.    Execute an expensive function taking a batch of inputs, with escaping    all the inputs of the batch whose the output is already saved in the cache.    """results_from_current_cache_only=[(inpt,(pifp.exists()elseNone))forinpt,pinmap(lambdainpt:(inpt,path_for_input(inpt)),input_iter)]not_cached_yet_inputs=(inptforinpt,opt_outputinresults_from_current_cache_onlyifopt_outputisNone)new_results=expensive_function(not_cached_yet_inputs)def put_in_cache_and_return(input:Input,output:Output):cache_on_disk(output,path_for_input(input))returnoutputreturn((inpt,load_output_from_cache(opt_output)ifopt_outputisnotNoneelseput_in_cache_and_return(inpt,next(new_results)),)forinpt,opt_outputinresults_from_current_cache_only)