Source code for bluesearch.mining.pipeline

"""Complete pipeline to mine entities, relations, attributes from text."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import pandas as pd
import spacy

from .relation import REModel, annotate

SPECS = [
    "entity",
    "entity_type",
    "property",
    "property_value",
    "property_type",
    "property_value_type",
    "ontology_source",
    "paper_id",  # article_id:section_name:paragraph_id
    "start_char",
    "end_char",
]


[docs]def run_pipeline( texts, model_entities, models_relations, debug=False, excluded_entity_type="NaE" ): """Run end-to-end extractions. Parameters ---------- texts : iterable The elements in `texts` are tuples where the first element is the text to be processed and the second element is a dictionary with arbitrary metadata for the text. Each key in this dictionary will be used to construct a new column in the output data frame and the values will appear in the corresponding rows. Note that if `debug=False` then the output data frame will have exactly the columns specified by `SPECS`. That means that some columns produced by the entries in metadata might be dropped, and some empty columns might be added. model_entities : spacy.lang.en.English Spacy model. Note that this model defines entity types. models_relations : dict The keys are pairs (two element tuples) of entity types (i.e. ('GGP', 'CHEBI')). The first entity type is the subject and the second one is the object. Note that the entity types should correspond to those inside of `model_entities`. The value is a list of instances of relation extraction models, that is instances of some subclass of ``REModel``. debug : bool If True, columns are not necessarily matching the specification. However, they contain debugging information. If False, then matching exactly the specification. excluded_entity_type : str or None If a str, then all entities with type `not_entity_label` will be excluded. If None, then no exclusion will be taking place. Returns ------- pd.DataFrame The final table. If `debug=True` then it contains all the metadata. If False then it only contains columns in the official specification. """ # sanity checks if not isinstance(model_entities, spacy.language.Language): raise TypeError( "Current implementation requires `model_entities` to be an instance " "of `spacy.language.Language`. Try for example `model_entities = " 'bluesearch.utils.load_spacy_model("data_and_models/models/ner_er/' 'model-chemical")`.' ) if not all( [ isinstance(model, REModel) for model_list in models_relations.values() for model in model_list ] ): raise TypeError( "Each relation extraction model needs to be a subclass of REModel." ) if models_relations: disable_pipe = ( [] ) # parser is needed to split text into sentences, tagger for EntityRuler else: disable_pipe = ["parser"] docs_gen = model_entities.pipe(texts, disable=disable_pipe, as_tuples=True) lines = [] for doc, metadata in docs_gen: subtexts = doc.sents if models_relations else [doc] for subtext in subtexts: detected_entities = [ ent for ent in subtext.ents if excluded_entity_type is None or ent.label_ != excluded_entity_type ] for s_ent in detected_entities: # add single lines for entities lines.append( dict( entity=s_ent.text, entity_type=s_ent.label_, start_char=s_ent.start_char, end_char=s_ent.end_char, **metadata ) ) # extract relations for o_ent in detected_entities: if s_ent == o_ent: continue so = (s_ent.label_, o_ent.label_) if so in models_relations: for re_model in models_relations[so]: annotated_sent = annotate( doc, subtext, s_ent, o_ent, re_model.symbols ) property_ = re_model.predict(annotated_sent) lines.append( dict( entity=s_ent.text, entity_type=s_ent.label_, relation_model=re_model.__class__.__name__, start_char=s_ent.start_char, end_char=s_ent.end_char, property_type="relation", property=property_, property_value=o_ent.text, property_value_type=o_ent.label_, **metadata ) ) # enforce columns if there are no extractions or we are in prod mode if not lines or not debug: return pd.DataFrame(lines, columns=SPECS) else: return pd.DataFrame(lines)