Source code for bluesearch.mining.entity

"""Classes and functions for entity extraction (aka named entity recognition)."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import ast
import copy

import numpy as np
import pandas as pd
import spacy

from ..utils import JSONL


class PatternCreator:
    """Utility class for easy handling of patterns.

    Parameters
    ----------
    storage : None or pd.DataFrame
        If provided, we automatically populate `_storage` with it. If None,
        then we start from scratch - no patterns.

    Attributes
    ----------
    _storage : pd.DataFrame
        A representation of all patterns that allows for comfortable sorting,
        filtering, etc. Note that each row represents a single pattern.

    Examples
    --------
    >>> from bluesearch.mining import PatternCreator
    >>>
    >>> pc = PatternCreator()
    >>> pc.add("FOOD", [{"LOWER": "oreo"}])
    >>> pc.add("DRINK", [{"LOWER": {"REGEX": "^w"}}, {"LOWER": "milk"}])
    >>> doc = pc("It is necessary to dip the oreo in warm milk!")
    >>> [(str(e), e.label_) for e in doc.ents]
    [('oreo', 'FOOD'), ('warm milk', 'DRINK')]
    """

    def __init__(self, storage=None):
        if storage is None:
            columns = ["label"]
            self._storage = pd.DataFrame(columns=columns)
        else:
            self._storage = storage.reset_index(drop=True)

    def __call__(self, text, model=None, disable=None, **add_pipe_kwargs):
        """Test the current patterns on text.

        Parameters
        ----------
        text : str
            Some text.
        model : spacy.language.Language or None
            Spacy model. If not provided we default to `spacy.blank("en")`.
        disable : list or None
            List of elements to remove from the pipeline.
        **add_pipe_kwargs : dict
            Additional parameters to be passed to the `add_pipe` method.
            Note that one can control the position of the ``EntityRuler``
            this way. If not specified we put it at the very end.

        Returns
        -------
        doc : spacy.tokens.Doc
            Doc containing the entities under the `ents` property.
        """
        model = model or spacy.blank("en")
        disable = disable or []
        add_pipe_kwargs = add_pipe_kwargs or {"last": True}

        er = model.add_pipe(
            "entity_ruler", config={"validate": True}, **add_pipe_kwargs
        )
        er.add_patterns(self.to_list())

        return model(text, disable=disable)

    def __eq__(self, other):
        """Determine if equal.

        Parameters
        ----------
        other : PatternCreator
            Some other PatternCreator that we wish to compare to.

        Returns
        -------
        bool
            If True, the patterns are identical. Note that the order does
            not matter.
        """
        if not isinstance(other, self.__class__):
            return False

        self_df_unsorted = self.to_df()
        other_df_unsorted = other.to_df()

        if set(self_df_unsorted.columns) != set(other_df_unsorted.columns):
            return False

        sort_by = list(self_df_unsorted.columns)
        self_df_sorted = self_df_unsorted.sort_values(by=sort_by)
        other_df_sorted = other_df_unsorted.sort_values(by=sort_by)

        self_is_nan = self_df_sorted.isnull().values
        other_is_nan = other_df_sorted.isnull().values

        return np.array_equal(self_is_nan, other_is_nan) and np.array_equal(
            self_df_sorted.values[~self_is_nan],
            other_df_sorted.values[~other_is_nan],
        )
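    # Illustrative sketch (not part of the original module): since `__eq__`
    # sorts the underlying storage before comparing, the order in which
    # patterns were added does not matter.
    #
    #     pc_a = PatternCreator()
    #     pc_a.add("FOOD", "oreo")
    #     pc_a.add("DRINK", "milk")
    #
    #     pc_b = PatternCreator()
    #     pc_b.add("DRINK", "milk")
    #     pc_b.add("FOOD", "oreo")
    #
    #     assert pc_a == pc_b
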
    def add(self, label, pattern, check_exists=True):
        """Add a single pattern.

        Parameters
        ----------
        label : str
            Entity type to associate with a given pattern.
        pattern : str or dict or list
            The pattern we want to match. The behavior depends on the type.

            - ``str``: can be used for exact matching (case sensitive). We
              internally convert it to a single-token pattern
              `{"TEXT": pattern}`.
            - ``dict``: a single-token pattern. This dictionary can contain
              at most 2 entries. The first one represents the
              attribute-value pair (e.g. `"LEMMA": "world"`). The second has
              the key "OP" and is optional. It represents the
              operator/quantifier to be used. An example of a valid pattern
              dict is `{"LEMMA": "world", "OP": "+"}`. Note that it would
              detect entities like "world" and "world world world".
            - ``list``: a multi-token pattern. A list of dictionaries of the
              same form as described above.
        check_exists : bool
            If True, only patterns that do not exist yet can be added.
        """
        if isinstance(pattern, str):
            pattern_ = [{"TEXT": pattern}]
        elif isinstance(pattern, dict):
            pattern_ = [pattern]
        elif isinstance(pattern, list):
            pattern_ = pattern
        else:
            raise TypeError("Unsupported type of pattern")

        new_row = self.raw2row({"label": label, "pattern": pattern_})
        new_storage = pd.concat(
            [self._storage, new_row.to_frame().T], ignore_index=True
        )

        if check_exists and new_storage.duplicated().any():
            raise ValueError("The pattern already exists")

        self._storage = new_storage
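    # Illustrative sketch (not part of the original module): the three
    # accepted pattern forms, each normalized internally to a list of
    # per-token dictionaries.
    #
    #     pc = PatternCreator()
    #     pc.add("FOOD", "oreo")               # str  -> [{"TEXT": "oreo"}]
    #     pc.add("FOOD", {"LEMMA": "cookie"})  # dict -> single-token pattern
    #     pc.add("DRINK", [{"LOWER": "warm"}, {"LOWER": "milk"}])  # multi-token
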
    def drop(self, labels):
        """Drop one or multiple patterns.

        Parameters
        ----------
        labels : int or list
            If ``int``, then it represents a single row index to be dropped.
            If ``list``, then a collection of row indices to be dropped.
        """
        self._storage = self._storage.drop(index=labels).reset_index(drop=True)
    def to_df(self):
        """Convert to a pd.DataFrame.

        Returns
        -------
        pd.DataFrame
            Copy of the `_storage`. Each row represents a single entity type
            pattern. All elements are strings.
        """
        return self._storage.copy()
    def to_list(self, sort_by=None):
        """Convert to a list.

        Parameters
        ----------
        sort_by : None or list
            If None, then no sorting takes place. If ``list``, then the
            names of columns along which to sort.

        Returns
        -------
        list
            A list where each element represents one entity type pattern.
            Note that this list can be directly passed into the
            `EntityRuler`.
        """
        storage = self.to_df()
        sorted_storage = (
            storage.sort_values(by=sort_by) if sort_by is not None else storage
        )

        return [self.row2raw(row) for _, row in sorted_storage.iterrows()]
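    # Illustrative sketch (not part of the original module): the output of
    # `to_list` can be fed directly to a spaCy `EntityRuler`, assuming `pc`
    # is a populated PatternCreator.
    #
    #     import spacy
    #     nlp = spacy.blank("en")
    #     ruler = nlp.add_pipe("entity_ruler")
    #     ruler.add_patterns(pc.to_list())
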
    def to_jsonl(self, path, sort_by=None):
        """Save to a JSONL file.

        Parameters
        ----------
        path : pathlib.Path
            File where to save it.
        sort_by : None or list
            If None, then no sorting takes place. If ``list``, then the
            names of columns along which to sort.
        """
        patterns = self.to_list(sort_by=sort_by)
        JSONL.dump_jsonl(patterns, path)
    @classmethod
    def from_jsonl(cls, path):
        """Load from a JSONL file.

        Parameters
        ----------
        path : pathlib.Path
            Path to a JSONL file with patterns.

        Returns
        -------
        pattern_creator : bluesearch.mining.PatternCreator
            Instance of a ``PatternCreator``.
        """
        inst = cls()
        patterns = JSONL.load_jsonl(path)

        for p in patterns:
            inst.add(label=p["label"], pattern=p["pattern"])

        return inst
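    # Illustrative sketch (not part of the original module): `to_jsonl` and
    # `from_jsonl` round-trip, assuming a hypothetical file "patterns.jsonl".
    #
    #     from pathlib import Path
    #     path = Path("patterns.jsonl")
    #     pc.to_jsonl(path)
    #     pc_restored = PatternCreator.from_jsonl(path)
    #     assert pc == pc_restored
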
    @staticmethod
    def raw2row(raw):
        """Convert an element of a patterns list to a pd.Series.

        The goal of this function is to create a pd.Series with all entries
        being strings. This allows us to check for duplicates between
        different rows very quickly.

        Parameters
        ----------
        raw : dict
            Dictionary with two keys: "label" and "pattern". The "pattern"
            needs to be a list of dictionaries, each representing a pattern
            for a given token. The "label" is a string representing the
            entity type.

        Returns
        -------
        row : pd.Series
            The index contains the following elements: "label",
            "attribute_0", "value_0", "value_type_0", "op_0",
            "attribute_1", "value_1", "value_type_1", "op_1", ...
        """
        if not isinstance(raw["label"], str):
            raise TypeError("The label needs to be a string")

        if not isinstance(raw["pattern"], list):
            raise TypeError("The pattern needs to be a list")

        d = {"label": raw["label"]}
        for token_ix, e in enumerate(raw["pattern"]):
            if not isinstance(e, dict):
                raise TypeError("The per token pattern needs to be a dictionary")

            if len(e) == 1:
                pass
            elif len(e) == 2 and "OP" in e:
                pass
            else:
                raise ValueError(
                    "Invalid element, multi-attribute matches are not supported"
                )

            attribute = next(filter(lambda key: key != "OP", e))
            value_type = type(e[attribute]).__name__
            value = str(e[attribute])
            op = e.get("OP", "")

            d.update(
                {
                    f"attribute_{token_ix}": attribute,
                    f"value_{token_ix}": value,
                    f"value_type_{token_ix}": value_type,
                    f"op_{token_ix}": op,
                }
            )

        return pd.Series(d)
    @staticmethod
    def row2raw(row):
        """Convert a pd.Series to a valid pattern dictionary.

        Note that `value_{i}` is always a string, however, we cast it back
        to the type recorded in `value_type_{i}`. In most cases the type
        will be ``int``, ``str`` or ``dict``. Since this casting is done
        dynamically we use `ast.literal_eval`.

        Parameters
        ----------
        row : pd.Series
            The index contains the following elements: "label",
            "attribute_0", "value_0", "value_type_0", "op_0",
            "attribute_1", "value_1", "value_type_1", "op_1", ...

        Returns
        -------
        raw : dict
            Dictionary with two keys: "label" and "pattern". The "pattern"
            is a list of dictionaries, each representing a pattern for a
            given token. The "label" is a string representing the entity
            type.
        """
        pattern = []
        token_ix = 0
        while True:
            try:
                attribute = row[f"attribute_{token_ix}"]  # str
                value_str = row[f"value_{token_ix}"]  # str
                value_type = row[f"value_type_{token_ix}"]  # str
                op = row[f"op_{token_ix}"]  # str

                if any(
                    not isinstance(x, str)
                    for x in [attribute, value_str, value_type, op]
                ):
                    raise KeyError()

                if value_type != "str":
                    try:
                        value = ast.literal_eval(value_str)
                    except ValueError as ve:
                        if str(ve).startswith("malformed node or string"):
                            raise NameError(str(ve)) from ve
                        else:
                            raise
                else:
                    value = value_str

                token_pattern = {attribute: value}
                if op:
                    token_pattern["OP"] = op

                pattern.append(token_pattern)

            except KeyError:
                break

            token_ix += 1

        if token_ix == 0:
            raise ValueError("No valid pattern was found")

        return {"label": row["label"], "pattern": pattern}
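    # Illustrative sketch (not part of the original module): `raw2row` and
    # `row2raw` are inverses of each other for well-formed patterns.
    #
    #     raw = {"label": "DRINK", "pattern": [{"LOWER": "milk", "OP": "+"}]}
    #     row = PatternCreator.raw2row(raw)
    #     # row index: label, attribute_0, value_0, value_type_0, op_0
    #     assert PatternCreator.row2raw(row) == raw
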
def global2model_patterns(patterns, entity_type):
    """Remap entity types in the patterns to a specific model.

    For each entity type in the patterns, check whether the model supports
    it; if not, relabel the entity type to `NaE`.

    Parameters
    ----------
    patterns : list
        List of patterns.
    entity_type : str
        Entity type detected by a spacy model.

    Returns
    -------
    adjusted_patterns : list
        Patterns adjusted to a specific spacy model.
    """
    adjusted_patterns = copy.deepcopy(patterns)
    for p in adjusted_patterns:
        label = p["label"]

        if label.lower() != entity_type.lower():
            p["label"] = "NaE"

    return adjusted_patterns
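# Illustrative sketch (not part of the original module): labels that do not
# match the model's entity type (case-insensitively) are relabeled to "NaE".
#
#     patterns = [
#         {"label": "CHEMICAL", "pattern": [{"LOWER": "oreo"}]},
#         {"label": "DISEASE", "pattern": [{"LOWER": "flu"}]},
#     ]
#     global2model_patterns(patterns, "chemical")
#     # -> CHEMICAL pattern kept, DISEASE pattern relabeled to "NaE"
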
def check_patterns_agree(model, patterns):
    """Validate whether patterns of an existing model agree with given patterns.

    Parameters
    ----------
    model : spacy.language.Language
        A model that contains an `EntityRuler`.
    patterns : list
        List of patterns.

    Returns
    -------
    res : bool
        If True, the patterns agree.

    Raises
    ------
    ValueError
        The model does not contain an entity ruler or it contains more
        than one.
    """
    all_er = [
        pipe
        for _, pipe in model.pipeline
        if isinstance(pipe, spacy.pipeline.EntityRuler)
    ]

    if not all_er:
        raise ValueError("The model contains no EntityRuler")
    elif len(all_er) > 1:
        raise ValueError("The model contains more than 1 EntityRuler")
    else:
        return patterns == all_er.pop().patterns
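# Illustrative sketch (not part of the original module): typical use of
# `check_patterns_agree`, assuming `pc` is a populated PatternCreator and the
# ruler's stored patterns come back in the same order they were added.
#
#     nlp = spacy.blank("en")
#     ruler = nlp.add_pipe("entity_ruler")
#     ruler.add_patterns(pc.to_list())
#     assert check_patterns_agree(nlp, pc.to_list())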