Source code for bluesearch.widgets.mining_schema

"""Implementation of the MiningSchma class."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import warnings

import pandas as pd


class MiningSchema:
    """The mining schema for the mining widget.

    The schema is stored as a dataframe in ``schema_df`` with one row per
    entity/property combination and the columns listed in ``columns``.
    """

    def __init__(self):
        # Fixed set of schema columns, in display order.
        self.columns = (
            "entity_type",
            "property",
            "property_type",
            "property_value_type",
            "ontology_source",
        )
        self.schema_df = pd.DataFrame(columns=self.columns)

    def add_entity(
        self,
        entity_type,
        property_name=None,
        property_type=None,
        property_value_type=None,
        ontology_source=None,
    ):
        """Add a new entity to the schema.

        A warning is issued for duplicate entities.

        Parameters
        ----------
        entity_type : str
            The entity type, for example "CHEMICAL".
        property_name: str, optional
            The property name, for example "isChiral".
        property_type : str, optional
            The property type, for example "ATTRIBUTE".
        property_value_type : str, optional
            The property value type, for example "BOOLEAN".
        ontology_source : str, optional
            The ontology source, for example "NCIT".
        """
        row = {
            "entity_type": entity_type,
            "property": property_name,
            "property_type": property_type,
            "property_value_type": property_value_type,
            "ontology_source": ontology_source,
        }

        # Make sure there are no duplicates to begin with
        self.schema_df = self.schema_df.drop_duplicates(ignore_index=True)

        # DataFrame.append() was removed in pandas 2.0; concatenating a
        # one-row frame is the supported way to add a single record.
        new_row_df = pd.DataFrame([row])
        self.schema_df = pd.concat([self.schema_df, new_row_df], ignore_index=True)

        # If there are any duplicates at this point, then it must have
        # come from the appended row.
        if self.schema_df.duplicated().any():
            self.schema_df = self.schema_df.drop_duplicates(ignore_index=True)
            warnings.warn("This entry already exists. No new entry was created.")

    def add_from_df(self, entity_df):
        """Add entities from a given dataframe.

        The data frame has to contain a column named "entity_type". Any
        columns matching the schema columns will be processed, all other
        columns will be ignored.

        Parameters
        ----------
        entity_df : pd.DataFrame
            The dataframe with new entities.

        Raises
        ------
        ValueError
            If ``entity_df`` has no column named "entity_type".
        """
        # The dataframe must contain the "entity_type" column
        if "entity_type" not in entity_df.columns:
            raise ValueError("Column named entity_type not found.")

        # Collect all other valid columns. Columns of the input that are not
        # part of the schema are skipped, and a warning is issued for each.
        valid_columns = []
        for column in entity_df:
            if column in self.schema_df.columns:
                valid_columns.append(column)
            else:
                warnings.warn(f"No column named {column} was found.")

        # Add new data to the schema; schema columns missing from the input
        # are passed as None via Series.get().
        for _, row in entity_df[valid_columns].iterrows():
            self.add_entity(
                row["entity_type"],
                property_name=row.get("property"),
                property_type=row.get("property_type"),
                property_value_type=row.get("property_value_type"),
                ontology_source=row.get("ontology_source"),
            )

    @property
    def df(self):
        """Get a dataframe with all entities.

        Returns
        -------
        schema_df : pd.DataFrame
            A copy of the dataframe with all entities, so that changes made
            by the caller do not affect the schema.
        """
        return self.schema_df.copy()