Source code for bluesearch.widgets.mining_schema
"""Implementation of the MiningSchema class."""
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import warnings
import pandas as pd
class MiningSchema:
    """The mining schema for the mining widget.

    The schema is stored as a table with one row per entry and one column
    per name in ``columns``. Entries are added with `add_entity` or
    `add_from_df`; exact duplicate rows are never kept.

    Attributes
    ----------
    columns : tuple of str
        The names of the schema columns, in order.
    schema_df : pd.DataFrame
        The table holding all schema entries. Use the `df` property to
        obtain a copy that is safe to modify.
    """

    def __init__(self):
        self.columns = (
            "entity_type",
            "property",
            "property_type",
            "property_value_type",
            "ontology_source",
        )
        self.schema_df = pd.DataFrame(columns=self.columns)

    def add_entity(
        self,
        entity_type,
        property_name=None,
        property_type=None,
        property_value_type=None,
        ontology_source=None,
    ):
        """Add a new entity to the schema.

        A warning is issued for duplicate entities.

        Parameters
        ----------
        entity_type : str
            The entity type, for example "CHEMICAL".
        property_name : str, optional
            The property name, for example "isChiral".
        property_type : str, optional
            The property type, for example "ATTRIBUTE".
        property_value_type : str, optional
            The property value type, for example "BOOLEAN".
        ontology_source : str, optional
            The ontology source, for example "NCIT".

        Warns
        -----
        UserWarning
            If an identical entry is already present in the schema. The
            schema is left without a second copy of that entry.
        """
        row = {
            "entity_type": entity_type,
            "property": property_name,
            "property_type": property_type,
            "property_value_type": property_value_type,
            "ontology_source": ontology_source,
        }

        # Make sure there are no duplicates to begin with
        self.schema_df = self.schema_df.drop_duplicates(ignore_index=True)

        # DataFrame.append() was removed in pandas 2.0, so the new entry is
        # wrapped in a one-row frame and joined with pd.concat() instead.
        # dtype=object matches the dtype of the (initially empty) schema
        # table, which keeps None values as they are and avoids dtype
        # inference differences between the two frames.
        new_row_df = pd.DataFrame([row], dtype=object)
        self.schema_df = pd.concat([self.schema_df, new_row_df], ignore_index=True)

        # If there are any duplicates at this point, then it must have
        # come from the appended row.
        if self.schema_df.duplicated().any():
            self.schema_df = self.schema_df.drop_duplicates(ignore_index=True)
            warnings.warn("This entry already exists. No new entry was created.")

    def add_from_df(self, entity_df):
        """Add entities from a given dataframe.

        The data frame has to contain a column named "entity_type". Any
        columns matching the schema columns will be processed, all other
        columns will be ignored (a warning is issued for each of them).
        Schema columns missing from the data frame are filled with ``None``.

        Parameters
        ----------
        entity_df : pd.DataFrame
            The dataframe with new entities.

        Raises
        ------
        ValueError
            If ``entity_df`` has no column named "entity_type".

        Warns
        -----
        UserWarning
            For every column of ``entity_df`` that is not a schema column,
            and for every row that duplicates an existing entry.
        """
        # The dataframe must contain the "entity_type" column
        if "entity_type" not in entity_df.columns:
            raise ValueError("Column named entity_type not found.")

        # Collect all other valid columns
        valid_columns = []
        for column in entity_df:
            if column in self.schema_df.columns:
                valid_columns.append(column)
            else:
                warnings.warn(f"No column named {column} was found.")

        # Add new data to the schema, one row at a time so that the
        # duplicate handling of add_entity applies to every row.
        for _, row in entity_df[valid_columns].iterrows():
            self.add_entity(
                row["entity_type"],
                property_name=row.get("property"),
                property_type=row.get("property_type"),
                property_value_type=row.get("property_value_type"),
                ontology_source=row.get("ontology_source"),
            )

    @property
    def df(self):
        """Get a dataframe with all entities.

        Returns
        -------
        schema_df : pd.DataFrame
            A copy of the dataframe with all entities; modifying it does
            not change the schema.
        """
        return self.schema_df.copy()