# Source code for bluesearch.widgets.article_saver

"""Module for the article_saver."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import datetime
import pathlib
import textwrap

import pandas as pd

from .._css import style
from ..sql import (
    retrieve_article_metadata_from_article_id,
    retrieve_articles,
    retrieve_paragraph,
)


class ArticleSaver:
    """Keeps track of selected articles.

    This class can be used to save a number of articles and paragraphs
    for a later use. A typical use case is to keep track of the items
    selected in the search widget, and to retrieve them later in the
    mining widget.

    Furthermore this class allows to print a summary of all selected
    items using the `summary_table` method, to resolve all items into
    paragraphs with the corresponding section name and to summarize
    them in a pandas data frame using the method `get_chosen_texts`,
    and to export an HTML report of all saved items using the method
    `make_report`.

    Parameters
    ----------
    connection : sqlalchemy.engine.Engine
        An SQL database connectable compatible with `pandas.read_sql`.
        The database is supposed to have paragraphs and articles tables.

    Attributes
    ----------
    connection : sqlalchemy.engine.Engine
        An SQL database connectable compatible with `pandas.read_sql`.
        The database is supposed to have paragraphs and articles tables.
    state : set
        The state that keeps track of saved items. It is a set of tuples
        of the form `(article_id, paragraph_pos_in_article)` each
        representing one saved item. The items with
        `paragraph_pos_in_article = -1` indicate that the whole article
        should be saved.
    state_hash : int or None
        A hash of the state for which `df_chosen_texts` was last
        computed. This is used to cache `df_chosen_texts` and avoid
        recomputing it if the state has not changed. It is `None` until
        `get_chosen_texts` is called for the first time.
    df_chosen_texts : pd.DataFrame
        The rows represent different paragraphs and the columns are
        'article_id', 'section_name', 'paragraph_pos_in_article', 'text'.
    """

    def __init__(self, connection):
        self.connection = connection
        self.state = set()
        self.state_hash = None
        self.df_chosen_texts = pd.DataFrame(
            columns=["article_id", "section_name", "paragraph_pos_in_article", "text"]
        )

    def add_article(self, article_id):
        """Save an article.

        Parameters
        ----------
        article_id : int
            The article ID.
        """
        self.add_paragraph(article_id, -1)

    def add_paragraph(self, article_id, paragraph_pos_in_article):
        """Save a paragraph.

        Both arguments are converted with `int` before being stored.

        Parameters
        ----------
        article_id : int
            The article ID.
        paragraph_pos_in_article : int
            The position of the paragraph in the article. The value -1
            stands for the whole article.
        """
        self.state.add((int(article_id), int(paragraph_pos_in_article)))

    def has_article(self, article_id):
        """Check if an article has been saved.

        Parameters
        ----------
        article_id : int
            The article ID.

        Returns
        -------
        result : bool
            Whether or not the given article has been saved.
        """
        return self.has_paragraph(article_id, -1)

    def has_paragraph(self, article_id, paragraph_pos_in_article):
        """Check if a paragraph has been saved.

        Parameters
        ----------
        article_id : int
            The article ID.
        paragraph_pos_in_article : int
            The position of the paragraph in the article. The value -1
            stands for the whole article.

        Returns
        -------
        result : bool
            Whether or not the given paragraph has been saved.
        """
        return (int(article_id), int(paragraph_pos_in_article)) in self.state

    def remove_article(self, article_id):
        """Remove an article from saved.

        Parameters
        ----------
        article_id : int
            The article ID.
        """
        self.remove_paragraph(article_id, -1)

    def remove_paragraph(self, article_id, paragraph_pos_in_article):
        """Remove a paragraph from saved.

        Nothing happens if the item has not been saved.

        Parameters
        ----------
        article_id : int
            The article ID.
        paragraph_pos_in_article : int
            The position of the paragraph in the article. The value -1
            stands for the whole article.
        """
        # Normalise to int exactly like `add_paragraph` and `has_paragraph`,
        # otherwise an item added as e.g. ("3", "7") or with numpy integers
        # could never be removed with the same arguments.
        self.state.discard((int(article_id), int(paragraph_pos_in_article)))

    def remove_all(self):
        """Remove all saved items."""
        self.state.clear()

    def _get_clean_state(self):
        """Get a clean state of the article saver.

        Paragraphs belonging to an article that is saved as a whole are
        dropped, since they are already covered by the full article.

        Returns
        -------
        full_articles : set of int
            Set of the article ids chosen by the user.
        just_paragraphs : set of tuple
            Set of tuple (article_id, paragraph_pos_in_article) chosen by the user.
        """
        full_articles = {
            article_id
            for article_id, paragraph_pos_in_article in self.state
            if paragraph_pos_in_article == -1
        }
        just_paragraphs = {
            (article_id, paragraph_pos_in_article)
            for article_id, paragraph_pos_in_article in self.state
            if paragraph_pos_in_article != -1 and article_id not in full_articles
        }
        return full_articles, just_paragraphs

    def get_saved_items(self):
        """Retrieve the saved items that summarize the choice of the users.

        Returns
        -------
        identifiers : list of tuple
            Tuple (article_id, paragraph_pos_in_article) chosen by the user.
            Full articles come first, with -1 as the paragraph position,
            followed by the individually saved paragraphs.
        """
        full_articles, just_paragraphs = self._get_clean_state()
        saved_items = [(article_id, -1) for article_id in full_articles]
        saved_items.extend(just_paragraphs)
        return saved_items

    def _update_chosen_texts(self):
        """Recompute the chosen texts."""
        full_articles, just_paragraphs = self._get_clean_state()

        # Start from an empty slice of the current frame so that the
        # expected columns are kept even when nothing is selected.
        # NOTE(review): assumes the `retrieve_*` helpers return data frames.
        frames = [self.df_chosen_texts[0:0]]
        frames.append(
            retrieve_articles(article_ids=full_articles, engine=self.connection)
        )
        frames.extend(
            retrieve_paragraph(
                article_id, paragraph_pos_in_article, engine=self.connection
            )
            for article_id, paragraph_pos_in_article in just_paragraphs
        )

        # `DataFrame.append` was removed in pandas 2.0; a single `pd.concat`
        # also avoids copying the accumulated frame on every iteration.
        self.df_chosen_texts = pd.concat(frames)

    def get_chosen_texts(self):
        """Retrieve the currently saved items.

        For all entire articles that are saved the corresponding
        paragraphs are resolved first. The result is cached and only
        recomputed when the hash of the state has changed.

        Returns
        -------
        df_chosen_texts : pandas.DataFrame
            A copy of the cached data frame of chosen texts.
        """
        state_hash = hash(tuple(sorted(self.state)))
        if state_hash != self.state_hash:
            self._update_chosen_texts()
            self.state_hash = state_hash

        return self.df_chosen_texts.copy()

    def _fetch_article_info(self, article_id):
        """Fetch the metadata of an article needed for the report.

        Parameters
        ----------
        article_id : int
            The article ID.

        Returns
        -------
        ref : object
            The value of the "url" column of the first metadata row.
        article_title : object
            The value of the "title" column of the first metadata row.
        article_authors : object
            The value of the "authors" column of the first metadata row.
        """
        article = retrieve_article_metadata_from_article_id(
            article_id=article_id, engine=self.connection
        )
        article_authors, article_title, ref = article.iloc[0][
            ["authors", "title", "url"]
        ]

        return ref, article_title, article_authors

    def make_report(self, output_dir=None):
        """Create the saved articles report.

        The report is an HTML file with one entry per saved article
        containing a title linking to the article, its authors, the
        section name and the selected paragraphs.

        Parameters
        ----------
        output_dir : str or pathlib.Path or None
            The directory for writing the report. If `None` then the
            current working directory is used.

        Returns
        -------
        output_file_path : pathlib.Path
            The file to which the report was written.

        Raises
        ------
        ValueError
            If the given output directory does not exist.
        """
        css_style = style.get_css_style()
        article_report = f"<style> {css_style} </style>"
        width = 80

        df_chosen_texts = self.get_chosen_texts()

        for article_id, df_article in df_chosen_texts.groupby("article_id"):
            df_article = df_article.sort_values(
                by="paragraph_pos_in_article", ascending=True, axis=0
            )
            section_names = df_article["section_name"].unique()
            if len(section_names) == 1:
                section_name = df_article["section_name"].iloc[0]
            else:
                section_name = (
                    f"{len(section_names)} different "
                    f"sections are selected for this article."
                )
            ref, article_title, article_authors = self._fetch_article_info(article_id)
            # NOTE(review): the values below are interpolated into the HTML
            # without escaping; confirm the database content is trusted.
            article_metadata = f"""
            <a href="{ref}">
                <div class="article_title">
                    {article_title}
                </div>
            </a>
            <div class="metadata">
                {article_authors} &#183; {section_name.lower().title()}
            </div>
            """
            article_report += article_metadata

            article_report += "<br/>".join(
                (textwrap.fill(t_, width=width) for t_ in df_article.text)
            )
            article_report += "<br/>" * 2

        if output_dir is None:
            output_dir = pathlib.Path.cwd()
        else:
            output_dir = pathlib.Path(output_dir)
            if not output_dir.exists():
                msg = f"The output directory {output_dir} does not exist."
                raise ValueError(msg)
        # NOTE(review): the default string form of the timestamp contains
        # spaces and colons, which are not valid in file names on Windows.
        output_file_path = (
            output_dir / f"article_saver_report_{datetime.datetime.now()}.html"
        )
        # Write explicitly as UTF-8 so that non-ASCII characters in the
        # article texts do not depend on the locale's default encoding.
        with output_file_path.open("w", encoding="utf-8") as f:
            f.write("<!DOCTYPE html>\n")
            f.write(article_report)

        return output_file_path

    def summary_table(self):
        """Create a dataframe table with saved articles.

        Returns
        -------
        table : pd.DataFrame
            DataFrame with one row per saved item and the columns
            'article_id', 'paragraph_pos_in_article' and 'option', where
            'option' tells whether the full article or a single paragraph
            was saved.
        """
        rows = [
            {
                "article_id": article_id,
                "paragraph_pos_in_article": paragraph_pos_in_article,
                "option": (
                    "Save full article"
                    if paragraph_pos_in_article == -1
                    else "Save paragraph"
                ),
            }
            for article_id, paragraph_pos_in_article in self.state
        ]
        table = pd.DataFrame(
            data=rows, columns=["article_id", "paragraph_pos_in_article", "option"]
        )

        return table