116 | """Identification of thesaurus with fuzzymatching in text chunks."""

from collections.abc import Iterator

from fuzzysearch import Match, find_near_matches
from thefuzz import fuzz

from .types import CompleteEntity, ThesaurusProvider
from ...utils import cache


def extended_expression(content: str, match: Match) -> str:
    """Return the extended expression around a given match.

    Examples:
        "WE ARE IN PONTEDERA", "PONTE" -> "PONTEDERA"
        "WE ARE IN AN APPARTEMENT", "PART" -> "APPARTEMENT"
        "WE ARE IN AN APPARTEMENT", "APPARTEMENT" -> "APPARTEMENT"
        "I am working for the Soprintendenza Archeologica della Toscana",
            "Soprintendenza Archeologica della Toscana"
            -> "Soprintendenza Archeologica della Toscana"
        "I am working for the Soprintendenza Archeologica della Toscana",
            "intendenza Archeologica della Toscana"
            -> "Soprintendenza Archeologica della Toscana"
    """
    content_length = len(content)
    extended_start = match.start
    if content[extended_start].isalnum():
        while extended_start > 0 and content[extended_start - 1].isalnum():
            extended_start -= 1
    extended_end = match.end
    if content[extended_end - 1].isalnum():
        while extended_end < content_length and content[extended_end].isalnum():
            extended_end += 1
    return content[extended_start:extended_end]
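

# For illustration: in "WE ARE IN PONTEDERA" the substring "PONTE" occupies
# indices [10, 15), so (assuming fuzzysearch's attrs-based
# Match(start, end, dist, matched) constructor; older versions may differ)
#     extended_expression("WE ARE IN PONTEDERA",
#                         Match(start=10, end=15, dist=0, matched="PONTE"))
# widens both boundaries over the alphanumeric run and returns "PONTEDERA".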


def filter_occurrences(
    content: str, thesaurus_value: str, matches: list[Match]
) -> list[Match]:
    """Keep the matches whose extended expression still matches the thesaurus value.

    For example, if "PART" is detected in the content "WE ARE IN AN APPARTEMENT",
    then this match will be excluded.
    """

    def filter_empty_word_matches(matches: list[Match]) -> list[Match]:
        return [m for m in matches if m.matched != ""]

    return [
        match
        for match in filter_empty_word_matches(matches)
        # the Levenshtein distance grows when the extended expression is much
        # longer than the thesaurus value, so the ratio decreases
        if fuzz.ratio(extended_expression(content, match), thesaurus_value) > 80
    ]
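

# Concretely: "PART" found inside "WE ARE IN AN APPARTEMENT" extends to
# "APPARTEMENT", and fuzz.ratio("APPARTEMENT", "PART") is roughly 53 with
# thefuzz's default scorer, well below the 80 threshold, so the spurious
# match is dropped; an exact or near-exact expression scores close to 100
# and is kept.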


@cache.get_memory_for("interim").cache
def extract_from_content(
    content: str,
    entity_set: list[CompleteEntity],
    wanted_entities: list[tuple[int, str]],
) -> set[int] | None:
    """Extract the ids of the wanted entities that occur in the content.

    Both the wanted entities and the content are expected to be normalized.
    """
    if not entity_set:
        return None
    return {
        thesaurus_id
        for thesaurus_id, thesaurus_value in wanted_entities
        if filter_occurrences(
            content,
            thesaurus_value,
            find_near_matches(thesaurus_value, content, max_l_dist=2),
        )
    }


def normalize_text(txt: str) -> str:
    """Apply simple normalization to make the comparison easier."""
    return txt.lower()


def extract_wanted_entities(
    chunk_contents: Iterator[str],
    complete_entity_sets: Iterator[list[CompleteEntity]],
    thesauri_factory: ThesaurusProvider,
) -> Iterator[set[int] | None]:
    """Keep only the entities that fuzzy-match a wanted thesaurus value.

    Arguments:
        chunk_contents: for each chunk, its text content
        complete_entity_sets: for each text chunk, the set of entities that
            occur in it, restricted to a group of entity types
        thesauri_factory: provides the wanted thesaurus values to be searched
            for, in the same group of entity types

    Returns:
        For each text chunk, the set of thesaurus ids that matched above the
        distance threshold. If a chunk contains no entity of the wanted group,
        None is returned for that chunk instead of the empty set; the empty set
        means the chunk contains entities of the wanted group, but none of them
        matches the thesaurus.
    """
    normalized_thesauri = [
        (thesaurus_id, normalize_text(thesaurus_value))
        for thesaurus_id, thesaurus_value in thesauri_factory()
    ]
    return (
        extract_from_content(
            normalize_text(content), entity_set, normalized_thesauri
        )
        for content, entity_set in zip(
            chunk_contents, complete_entity_sets, strict=True
        )
    )
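

if __name__ == "__main__":
    # Minimal smoke test, a sketch rather than part of the pipeline: run it as
    # a module (e.g. `python -m <package>.<this_module>`) so the relative
    # imports resolve. `demo_provider` is a stand-in for a real
    # ThesaurusProvider, assumed here to be a zero-argument callable returning
    # (thesaurus_id, thesaurus_value) pairs, mirroring the `thesauri_factory()`
    # call above. Only the truthiness of each entity set matters in
    # `extract_from_content`, so a placeholder string stands in for a real
    # CompleteEntity.
    def demo_provider() -> list[tuple[int, str]]:
        return [
            (1, "Pontedera"),
            (2, "Soprintendenza Archeologica della Toscana"),
        ]

    chunks = ["We are in Pontedera.", "We are in an appartement."]
    entity_sets = [["placeholder-entity"], ["placeholder-entity"]]
    for result in extract_wanted_entities(
        iter(chunks), iter(entity_sets), demo_provider
    ):
        print(result)
    # Expected output: {1} for the first chunk (near match on "pontedera"),
    # and set() for the second (entities present, but no thesaurus match).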