"""Complete pipeline to mine entities, relations, attributes from text."""
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
import spacy
from .relation import REModel, annotate
# Column names (and their order) of the data frame returned by `run_pipeline`
# when it enforces the official specification (i.e. when `debug=False` or
# when nothing was extracted).
SPECS = [
    "entity",  # text of the detected entity
    "entity_type",  # label of the detected entity
    "property",  # prediction of the relation extraction model
    "property_value",  # text of the object entity of the relation
    "property_type",  # set to "relation" for relation rows
    "property_value_type",  # label of the object entity of the relation
    "ontology_source",
    "paper_id",  # article_id:section_name:paragraph_id
    "start_char",  # character offset where the entity starts
    "end_char",  # character offset where the entity ends
]
def run_pipeline(
    texts, model_entities, models_relations, debug=False, excluded_entity_type="NaE"
):
    """Run end-to-end extractions.

    Entities are detected with `model_entities`. If relation extraction
    models are provided, each text is additionally split into sentences and
    every ordered pair of distinct entities found in the same sentence is
    passed to the relation models registered for that pair of entity types.

    Parameters
    ----------
    texts : iterable
        The elements in `texts` are tuples where the first element is the text
        to be processed and the second element is a dictionary with arbitrary
        metadata for the text. Each key in this dictionary will be used to
        construct a new column in the output data frame and the values will
        appear in the corresponding rows.

        Note that if `debug=False` then the output data frame will have
        exactly the columns specified by `SPECS`. That means that some
        columns produced by the entries in metadata might be dropped, and
        some empty columns might be added.
    model_entities : spacy.language.Language
        Spacy model. Note that this model defines entity types.
    models_relations : dict
        The keys are pairs (two element tuples) of entity types
        (i.e. ('GGP', 'CHEBI')). The first entity type is the subject
        and the second one is the object. Note that the entity types
        should correspond to those inside of `model_entities`. The value
        is a list of instances of relation extraction models, that is
        instances of some subclass of ``REModel``.
    debug : bool
        If True, columns are not necessarily matching the specification.
        However, they contain debugging information. If False, then
        matching exactly the specification.
    excluded_entity_type : str or None
        If a str, then all entities with type `excluded_entity_type` will be
        excluded. If None, then no exclusion will be taking place.

    Returns
    -------
    pd.DataFrame
        The final table. If `debug=True` then it contains all the metadata.
        If False then it only contains columns in the official specification.

    Raises
    ------
    TypeError
        If `model_entities` is not an instance of `spacy.language.Language`
        or if any of the models in `models_relations` is not an instance
        of ``REModel``.
    """
    # sanity checks
    if not isinstance(model_entities, spacy.language.Language):
        raise TypeError(
            "Current implementation requires `model_entities` to be an instance "
            "of `spacy.language.Language`. Try for example `model_entities = "
            'bluesearch.utils.load_spacy_model("data_and_models/models/ner_er/'
            'model-chemical")`.'
        )

    if not all(
        isinstance(model, REModel)
        for model_list in models_relations.values()
        for model in model_list
    ):
        raise TypeError(
            "Each relation extraction model needs to be a subclass of REModel."
        )

    # The parser is needed to split text into sentences (and the tagger for
    # the EntityRuler), so nothing is disabled when relations are extracted.
    # Without relation models the sentences are not needed and the parser
    # can be skipped.
    disable_pipe = [] if models_relations else ["parser"]

    docs_gen = model_entities.pipe(texts, disable=disable_pipe, as_tuples=True)

    lines = []
    for doc, metadata in docs_gen:
        # Relations are only searched within a single sentence; when there
        # are no relation models the whole document is processed at once.
        subtexts = doc.sents if models_relations else [doc]

        for subtext in subtexts:
            detected_entities = [
                ent
                for ent in subtext.ents
                if excluded_entity_type is None or ent.label_ != excluded_entity_type
            ]

            for s_ent in detected_entities:
                # add single lines for entities
                lines.append(
                    dict(
                        entity=s_ent.text,
                        entity_type=s_ent.label_,
                        start_char=s_ent.start_char,
                        end_char=s_ent.end_char,
                        **metadata
                    )
                )

                # extract relations
                for o_ent in detected_entities:
                    # skip pairs where subject and object compare equal
                    if s_ent == o_ent:
                        continue

                    so = (s_ent.label_, o_ent.label_)
                    if so not in models_relations:
                        continue

                    for re_model in models_relations[so]:
                        annotated_sent = annotate(
                            doc, subtext, s_ent, o_ent, re_model.symbols
                        )
                        property_ = re_model.predict(annotated_sent)
                        lines.append(
                            dict(
                                entity=s_ent.text,
                                entity_type=s_ent.label_,
                                relation_model=re_model.__class__.__name__,
                                start_char=s_ent.start_char,
                                end_char=s_ent.end_char,
                                property_type="relation",
                                property=property_,
                                property_value=o_ent.text,
                                property_value_type=o_ent.label_,
                                **metadata
                            )
                        )

    # enforce columns if there are no extractions or we are in prod mode
    if not lines or not debug:
        return pd.DataFrame(lines, columns=SPECS)

    return pd.DataFrame(lines)