Source code for bluesearch.widgets.mining_widget

"""Module for the mining widget."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import io
import json
import pathlib

import ipywidgets as widgets
import pandas as pd
import requests
from IPython.display import HTML, display

from .._css import style
from ..utils import Timer

DEFAULT_MINING_TEXT = """Autophagy maintains tumour growth through circulating
 arginine. Autophagy captures intracellular components and delivers them to
 lysosomes, where they are degraded and recycled to sustain metabolism and to
 enable survival during starvation. Acute, whole-body deletion of the essential
 autophagy gene Atg7 in adult mice causes a systemic metabolic defect that
 manifests as starvation intolerance and gradual loss of white adipose tissue,
 liver glycogen and muscle mass. Cancer cells also benefit from autophagy.
 Deletion of essential autophagy genes impairs the metabolism, proliferation,
 survival and malignancy of spontaneous tumours in models of autochthonous
 cancer. Acute, systemic deletion of Atg7 or acute, systemic expression of a
 dominant-negative ATG4b in mice induces greater regression of KRAS-driven
 cancers than does tumour-specific autophagy deletion, which suggests that host
 autophagy promotes tumour growth.""".replace(
    "\n", ""
)


[docs]class MiningWidget(widgets.VBox):
    """The mining widget.

    Parameters
    ----------
    mining_server_url : str
        The URL of the mining server.
    mining_schema : bluesearch.widgets.MiningSchema
        The requested mining schema (entity, relation, attribute types).
    article_saver : bluesearch.widgets.ArticleSaver
        An instance of the article saver.
    default_text : string, optional
        The default text assign to the text area.
    use_cache : bool
        If True the mining server will use cached mining results stored in an
        SQL database. Should lead to major speedups.
    checkpoint_path : str or pathlib.Path, optional
        Path where checkpoints are saved to and loaded from. If `None`, defaults
        to `~/.cache/bluesearch/widgets_checkpoints` folder.
    """

    def __init__(
        self,
        mining_server_url,
        mining_schema,
        article_saver=None,
        default_text=DEFAULT_MINING_TEXT,
        use_cache=True,
        checkpoint_path=None,
    ):
        super().__init__()

        self.mining_server_url = mining_server_url
        self.article_saver = article_saver
        self.mining_schema = mining_schema
        self.use_cache = use_cache

        # This is the output: csv table of extracted entities/relations.
        self.table_extractions = None

        # Define Widgets
        self.widgets = {}

        self._init_widgets(default_text)
        self._init_ui()

        response = requests.post(
            self.mining_server_url + "/help",
        )
        if not response.ok:
            raise Exception(
                f"It seems there is an issue with the bbs mining server. Response "
                f"status is {response.status_code} : {response.text}"
            )

        response_json = response.json()

        self.database_name = response_json["database"]  # e.g "cord19_v47"
        self.mining_server_version = response_json["version"]  # e.g. "0.0.9.dev2+g69"

        if checkpoint_path is not None:
            self.checkpoint_path = pathlib.Path(checkpoint_path)
        else:
            self.checkpoint_path = (
                pathlib.Path.home() / ".cache" / "bluesearch" / "widgets_checkpoints"
            )
        self.checkpoint_path = self.checkpoint_path / "bbs_mining.json"
        self.checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    def _init_widgets(self, default_text):
        # "Input Text" Widget
        self.widgets["input_text"] = widgets.Textarea(
            value=default_text, layout=widgets.Layout(width="75%", height="300px")
        )

        # "Mine This Text" button
        self.widgets["mine_text"] = widgets.Button(
            description="⚒️  Mine This Text!",
            layout=widgets.Layout(width="350px", height="50px"),
        )
        self.widgets["mine_text"].on_click(self._mine_text_clicked)
        self.widgets["mine_text"].add_class("bbs_button")

        # "Mine Selected Articles" button
        self.widgets["mine_articles"] = widgets.Button(
            description="⚒️  Mine Selected Articles!",
            layout=widgets.Layout(width="350px", height="50px"),
        )
        self.widgets["mine_articles"].on_click(self._mine_articles_clicked)
        self.widgets["mine_articles"].add_class("bbs_button")

        # Click to Save results
        self.widgets["save_button"] = widgets.Button(
            description="Save",
            icon="download",
            layout=widgets.Layout(width="172px", height="40px"),
        )
        self.widgets["save_button"].on_click(self._cb_bt_save)
        self.widgets["save_button"].add_class("bbs_button")

        # Click to Load results
        self.widgets["load_button"] = widgets.Button(
            description="Load",
            icon="upload",
            layout=widgets.Layout(width="172px", height="40px"),
        )
        self.widgets["load_button"].on_click(self._cb_bt_load)
        self.widgets["load_button"].add_class("bbs_button")

        # "Output Area" Widget
        self.widgets["out"] = widgets.Output(layout={"border": "0.5px solid black"})

        tabs = (
            (
                "Mine Articles",
                [
                    self.widgets["mine_articles"],
                ],
            ),
            (
                "Mine Text",
                [self.widgets["input_text"], self.widgets["mine_text"]],
            ),
        )

        tab_widget = widgets.Tab(children=[])
        for i, (tab_name, tab_children) in enumerate(tabs):
            tab_widget.children = tab_widget.children + (widgets.VBox(tab_children),)
            tab_widget.set_title(i, tab_name)
        self.widgets["mining"] = tab_widget

    def _init_ui(self):
        css_style = style.get_css_style()
        display(HTML(f"<style> {css_style} </style>"))

        self.children = [
            self.widgets["mining"],
            widgets.HBox(
                children=(self.widgets["save_button"], self.widgets["load_button"])
            ),
            self.widgets["out"],
        ]

[docs]    def textmining_pipeline(self, information, schema_df, debug=False):
        """Handle text mining server requests depending on the type of information.

        Parameters
        ----------
        information : str or list.
            Information can be either a raw string text, either a list of tuples
            (article_id, paragraph_id) related to the database.
        schema_df : pd.DataFrame
            A dataframe with the requested mining schema (entity, relation,
            attribute types).
        debug : bool
            If True, columns are not necessarily matching the specification.
            However, they contain debugging information. If False, then
            matching exactly the specification.

        Returns
        -------
        table_extractions : pd.DataFrame
            The final table. If `debug=True` then it contains all the
            metadata. If False then it only contains columns in the
            official specification.
        """
        schema_str = schema_df.to_csv(index=False)
        if isinstance(information, list):
            print(f"The widget is using database: {self.database_name}")
            response = requests.post(
                self.mining_server_url + "/database",
                json={
                    "identifiers": information,
                    "schema": schema_str,
                    "use_cache": self.use_cache,
                },
            )
        elif isinstance(information, str):
            response = requests.post(
                self.mining_server_url + "/text",
                json={"text": information, "schema": schema_str, "debug": debug},
            )
        else:
            raise TypeError("Wrong type for the information!")

        table_extractions = None
        if response.status_code == 200:
            response_dict = response.json()
            for warning_msg in response_dict["warnings"]:
                display(
                    HTML(
                        f'<div class="bbs_warning"> '
                        f"<b>WARNING!</b> {warning_msg} </div>"
                    )
                )
            with io.StringIO(response_dict["csv_extractions"]) as f:
                table_extractions = pd.read_csv(f)
        else:
            print("Server response is ERROR!")
            print(response.headers)
            print(response.text)

        return table_extractions

    def _mine_articles_clicked(self, b):
        self.widgets["out"].clear_output()

        if self.article_saver is None:
            with self.widgets["out"]:
                print("No article saver was provided. Nothing to mine.")
            return

        with self.widgets["out"]:
            timer = Timer()

            print("Collecting saved items...".ljust(50), end="", flush=True)
            with timer("collect items"):
                identifiers = self.article_saver.get_saved_items()

            print(f'{timer["collect items"]:7.2f} seconds')
            print("Mining request schema:")
            display(self.mining_schema.df)
            print("Running the mining pipeline...".ljust(50), end="", flush=True)
            with timer("pipeline"):
                self.table_extractions = self.textmining_pipeline(
                    information=identifiers, schema_df=self.mining_schema.df
                )
            print(f'{timer["pipeline"]:7.2f} seconds')

            display(self.table_extractions)

    def _mine_text_clicked(self, b):
        self.widgets["out"].clear_output()
        with self.widgets["out"]:
            print("Mining request schema:")
            display(self.mining_schema.df)
            print("Running the mining pipeline...".ljust(50), end="", flush=True)
            text = self.widgets["input_text"].value
            self.table_extractions = self.textmining_pipeline(
                information=text, schema_df=self.mining_schema.df
            )
            display(self.table_extractions)

    def _cb_bt_save(self, change_dict):
        with self.widgets["out"]:
            if self.table_extractions is None:
                message = """No mining results available. Did you forget
                             to run the mining pipeline on your selected
                             articles or text?"""
                display(HTML(f'<div class="bbs_error"> <b>ERROR!</b> {message} </div>'))
                return
            display(HTML("Saving mining results to disk...   "))
            data = {
                "mining_widget_extractions": self.table_extractions.to_dict(),
                "database_name": self.database_name,
                "mining_server_version": self.mining_server_version,
            }
            with self.checkpoint_path.open("w") as f:
                json.dump(data, f)
            display(HTML('<b class="bbs_success"> DONE!</b></br>'))

    def _cb_bt_load(self, change_dict):
        with self.widgets["out"]:
            self.widgets["out"].clear_output()
            if not self.checkpoint_path.exists():
                message = """No checkpoint file found to load. Did you forget to
                            save your mining results?"""
                display(
                    HTML(f'<div class="bbs_error"> ' f"<b>ERROR!</b> {message} </div>")
                )
                return
            display(HTML("Loading mining results from disk...   "))
            with self.checkpoint_path.open("r") as f:
                data = json.load(f)
            self.table_extractions = pd.DataFrame(data["mining_widget_extractions"])
            display(HTML('<b class="bbs_success"> DONE!</b></br>'))

            vers_load = data["mining_server_version"]
            vers_curr = self.mining_server_version
            db_load = data["database_name"]
            db_curr = self.database_name
            if db_load != db_curr or vers_load != vers_curr:
                message = f"""Loaded data from
                        <ul>
                            <li> mining server version = {vers_load} </li>
                            <li> database version = {db_load} </li>
                        </ul>
                        but current widget is connected to
                        <ul>
                            <li> mining server version = {vers_curr} </li>
                            <li> database version = {db_curr} </li>
                        </ul>
                        """
                display(
                    HTML(
                        f'<div class="bbs_warning"> '
                        f"<b>WARNING!</b> {message} </div>"
                    )
                )

            display(self.table_extractions)

    def _cb_chkb_show_mine_text_fct(self, change_dict):
        if change_dict["new"]:
            self.widgets["mine_text_fct"].layout.display = "block"
        else:
            self.widgets["mine_text_fct"].layout.display = "none"

[docs]    def get_extracted_table(self):
        """Retrieve the table with the mining results.

        Returns
        -------
        results_table : pandas.DataFrame
            The table with the mining results.
        """
        if self.table_extractions is not None:
            results_table = self.table_extractions.copy()
        else:
            results_table = pd.DataFrame()

        return results_table