
archaeo_super_prompt.modeling.entity_extractor.model

"""Core functions for inferring and filtering named entities in chunks."""

import itertools
from typing import cast

import requests
from tqdm import tqdm

from ...config.env import getenv_or_throw
from .types import CompleteEntity, NerOutput, NerXXLEntities


def _fetch_entities(ner_model_hosturl: str, chunks: list[str]) -> list[list[NerOutput]]:
    if not chunks:
        return []
    print("Fetching the transformers model")
    payload = {"chunks": chunks}
    response = requests.post(f"{ner_model_hosturl}/ner", json=payload,
                             timeout=60)
    response.raise_for_status()
    # Parse the JSON response into one list of NerOutput per input chunk
    return [
        [NerOutput(**dct) for dct in lst]
        for lst in cast(list[list[dict]], response.json())
    ]
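
# Illustrative sketch of the HTTP contract assumed by _fetch_entities
# (inferred from the code above, not from documented service behavior): the
# host behind NER_MODEL_HOST_URL exposes a POST /ner endpoint taking
# {"chunks": [...]} and returning, for each chunk, a list of
# NerOutput-shaped dicts. For example:
#
#   POST {NER_MODEL_HOST_URL}/ner
#   body:     {"chunks": ["Anna lives in Rome."]}
#   response: [[{"entity": "B-PER", "word": "Anna", "score": 0.99,
#                "start": 0, "end": 4}, ...]]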


def fetch_entities(chunks: list[str]) -> list[list[NerOutput]]:
    """Query the remote NER model to find the named entities in each chunk."""
    ner_model_hosturl = getenv_or_throw("NER_MODEL_HOST_URL")
    return list(
        itertools.chain.from_iterable(
            _fetch_entities(ner_model_hosturl, list(batch))
            for batch in tqdm(
                itertools.batched(chunks, 50),
                desc="NER analysing",
                unit="batch",
                # number of batches: ceil(len(chunks) / 50)
                total=(len(chunks) + 49) // 50,
            )
        )
    )
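
# Hypothetical usage sketch for fetch_entities (assumes NER_MODEL_HOST_URL is
# set and the remote service is reachable). The per-batch results are chained
# back into a single list of NerOutput lists, aligned one-to-one with the
# input chunks:
#
#   chunks = ["Anna lives in Rome.", "The dig started in 1998."]
#   raw_entities = fetch_entities(chunks)
#   assert len(raw_entities) == len(chunks)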


def gather_entity_chunks(
    entity_chunks: list[NerOutput], confidence_threshold: float
) -> list[CompleteEntity]:
    """Merge the raw NER output chunks of one text chunk into complete entities."""
    entity_set: list[CompleteEntity] = list()
    current_accumulated_entity: CompleteEntity | None = None
    for current_entity_chunk in entity_chunks:
        # Edge case: when a chunk is under the confidence threshold,
        # we only keep the already appended confident chunks of the entity
        # and ignore the following chunks
        if current_entity_chunk.score < confidence_threshold:
            if current_accumulated_entity is not None:
                entity_set.append(current_accumulated_entity)
                current_accumulated_entity = None
            continue

        if current_entity_chunk.entity.startswith("B-"):
            # Start a new entity with B- entities
            if current_accumulated_entity is not None:
                entity_set.append(current_accumulated_entity)
            current_accumulated_entity = CompleteEntity(
                entity=cast(NerXXLEntities, current_entity_chunk.entity[2:]),
                word=current_entity_chunk.word,
                start=current_entity_chunk.start,
                end=current_entity_chunk.end,
            )
        elif (
            current_accumulated_entity is not None
            # only merge chunks of the same entity type (the comment intent of
            # the original merge rule) that are consecutive or separated by
            # at most one space
            # WARN: it is expected that the text processed by the NER model
            # is normalized so words are separated by at most one space
            and current_entity_chunk.entity[2:]
            == current_accumulated_entity.entity
            and abs(
                current_entity_chunk.start - current_accumulated_entity.end
            )
            <= 1
        ):
            current_accumulated_entity.end = current_entity_chunk.end
            # Complete an entity with its additional chunks
            if current_entity_chunk.word.startswith("##"):
                # the chunk belongs to the same entity word
                current_accumulated_entity.word += current_entity_chunk.word[
                    2:
                ]
            else:
                # the entity is composed of several words
                current_accumulated_entity.word += (
                    " " + current_entity_chunk.word
                )
    # Flush the last accumulated entity once all chunks are consumed;
    # without this, the final entity of the chunk would be dropped
    if current_accumulated_entity is not None:
        entity_set.append(current_accumulated_entity)
    return entity_set
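
# Worked example for gather_entity_chunks (scores and offsets are made up for
# illustration; "PER" stands in for any entity type in NerXXLEntities):
#
#   raw = [
#       NerOutput(entity="B-PER", word="An", score=0.99, start=0, end=2),
#       NerOutput(entity="I-PER", word="##na", score=0.98, start=2, end=4),
#       NerOutput(entity="I-PER", word="Rossi", score=0.97, start=5, end=10),
#   ]
#   gather_entity_chunks(raw, confidence_threshold=0.5)
#   # -> [CompleteEntity(entity="PER", word="Anna Rossi", start=0, end=10)]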


def postprocess_entities(
    entities_per_text_chunk: list[list[NerOutput]], confidence_threshold: float
) -> list[list[CompleteEntity]]:
    """Return the set of entities that occurred in each chunk.

    Arguments:
        entities_per_text_chunk: for each chunk, a list of its retrieved \
entities ordered by their occurrence in the chunk's text content
        confidence_threshold: a threshold between 0 and 1 to keep only \
entities inferred with sufficient confidence
    """
    return [
        gather_entity_chunks(entity_chunks, confidence_threshold)
        for entity_chunks in entities_per_text_chunk
    ]


def filter_entities(
    complete_entity_sets: list[list[CompleteEntity]],
    allowed_entities: set[NerXXLEntities],
) -> list[list[CompleteEntity]]:
    """For each text chunk, keep only the entities whose type is in the given set of allowed entity types."""
    # each inner list is conceptually a set of entities for one chunk
    return [
        [e for e in s if e.entity in allowed_entities]
        for s in complete_entity_sets
    ]
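

# End-to-end usage sketch tying the module together (the entity type names
# below are assumptions for illustration, not guaranteed members of
# NerXXLEntities):
#
#   raw = fetch_entities(text_chunks)
#   merged = postprocess_entities(raw, confidence_threshold=0.8)
#   kept = filter_entities(merged, allowed_entities={"PER", "LOC"})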