# Source code for bluesearch.entrypoint.mining_cache

"""Entry point for mining a database and saving the extracted items in a cache."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import argparse
import getpass
import logging
import pathlib
import sys

import sqlalchemy
from sqlalchemy.pool import NullPool

from bluesearch.entrypoint._helper import (
    CombinedHelpFormatter,
    configure_logging,
    parse_args_or_environment,
)
from bluesearch.utils import get_available_spacy_models


def run_create_mining_cache(argv=None):
    """Mine all texts in a database and save the results in a cache.

    The command line is parsed, a SQLAlchemy engine is created for the
    selected database, the available spaCy models are collected (and
    optionally restricted to a subset of entity types), and finally a
    ``CreateMiningCache`` instance is built and its ``construct`` method
    is called.

    Parameters
    ----------
    argv : list_like of str, optional
        The command line arguments. They are forwarded unchanged to
        ``parse_args_or_environment``.

    Raises
    ------
    FileNotFoundError
        If ``--db-type`` is "sqlite" and no file exists at ``--db-url``.
    """
    # Imported locally so that the block does not rely on a new
    # module-level import.
    from urllib.parse import quote

    parser = argparse.ArgumentParser(
        description="Mine the CORD-19 database and cache the results.",
        formatter_class=CombinedHelpFormatter,
    )
    parser.add_argument(
        "--data-and-models-dir",
        type=str,
        help="""
        The local path to the "data_and_models" directory. It will
        be used to load the available spacy models from
        <data-and-models-dir>/models/ner_er/

        If missing, then the environment variable BBS_DATA_AND_MODELS_DIR
        will be read.
        """,
        default=argparse.SUPPRESS,
    )
    parser.add_argument(
        "--db-type",
        default="mysql",
        type=str,
        choices=("mysql", "sqlite"),
        help="Type of the database.",
    )
    parser.add_argument(
        "--db-url",
        type=str,
        help="""
        The location of the database depending on the database type.

        For MySQL the server URL should be provided, for SQLite the
        location of the database file. Generally, the scheme part of
        the URL should be omitted, e.g. for MySQL the URL should be
        of the form 'my_sql_server.ch:1234/my_database' and for SQLite
        of the form '/path/to/the/local/database.db'.

        If missing, then the environment variable DB_URL will
        be read.
        """,
        default=argparse.SUPPRESS,
    )
    parser.add_argument(
        "--target-table-name",
        default="mining_cache_temporary",
        type=str,
        help="The name of the target mining cache table",
    )
    parser.add_argument(
        "--n-processes-per-model",
        default=1,
        type=int,
        help="""
        Each mining model is run in parallel with respect to the others.
        In addition to that, n-processes-per-model are used to run in
        parallel a single mining model.
        """,
    )
    parser.add_argument(
        "--restrict-to-etypes",
        type=str,
        default=None,
        help="""
        Comma-separated list of entity types to detect
        to populate the cache. By default, all models in
        data_and_models/models/ner_er/ are run.
        """,
    )
    parser.add_argument(
        "--device",
        "-d",
        type=str,
        choices=["cpu", "cuda"],
        default="cpu",
        # The help text must list the values that `choices` really accepts.
        help="Device to use for the inference {'cpu', 'cuda'}.",
    )
    parser.add_argument(
        "--log-file",
        "-l",
        type=str,
        metavar="<filepath>",
        default=None,
        help="In addition to stderr, log messages to a file.",
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="""
        The logging level. Possible values:
        - 50 for CRITICAL
        - 40 for ERROR
        - 30 for WARNING
        - 20 for INFO
        - 10 for DEBUG
        - 0 for NOTSET
        """,
    )

    # Parse CLI arguments; the two options registered with
    # default=argparse.SUPPRESS may be supplied via these environment
    # variables instead (see their help texts).
    env_variable_names = {
        "db_url": "DB_URL",
        "data_and_models_dir": "BBS_DATA_AND_MODELS_DIR",
    }
    args = parse_args_or_environment(parser, env_variable_names, argv=argv)

    # Configure logging
    configure_logging(args.log_file, args.log_level)

    logger = logging.getLogger("Mining cache entrypoint")

    logger.info(" Configuration ".center(80, "-"))
    for k, v in vars(args).items():
        logger.info(f"{k:<32}: {v}")
    logger.info("-" * 80)

    # Loading libraries
    logger.info("Loading libraries")
    from bluesearch.database.mining_cache import CreateMiningCache

    # Database type
    logger.info("Parsing the database type")
    if args.db_type == "sqlite":
        database_path = pathlib.Path(args.db_url)
        if not database_path.exists():
            raise FileNotFoundError(f"No database found at {database_path}.")
        database_url = f"sqlite:///{database_path}"
    elif args.db_type == "mysql":
        password = getpass.getpass("MySQL root password: ")
        # Percent-encode the password so that characters with a special
        # meaning in URLs (e.g. "@", "/", ":", "#", "%") cannot corrupt the
        # database URL. With safe="" every reserved character is escaped.
        quoted_password = quote(password, safe="")
        database_url = f"mysql+pymysql://root:{quoted_password}@{args.db_url}"
    else:  # pragma: no cover
        # Will never get here because argument parsing will fail first.
        # This is because we have choices=("mysql", "sqlite") in the
        # argparse parameters
        raise ValueError("Invalid database type specified under --db-type")

    # Create the database engine
    logger.info("Creating the database engine")
    # The NullPool prevents the Engine from using any connection more than once
    # This is important for multiprocessing
    database_engine = sqlalchemy.create_engine(database_url, poolclass=NullPool)

    # Load the models library
    logger.info("Loading the available spacy models")
    ee_models_paths = get_available_spacy_models(args.data_and_models_dir)

    # Restrict to given models
    if args.restrict_to_etypes is not None:
        logger.info("Restricting to a subset of entity types")
        # Normalise the user input: surrounding whitespace is dropped and
        # the names are upper-cased before being matched against the keys
        # of `ee_models_paths`.
        etype_selection = {
            name.strip().upper() for name in args.restrict_to_etypes.split(",")
        }
        # Stray commas (e.g. "A,,B" or a trailing comma) produce empty
        # entries; drop them instead of warning about an empty etype.
        etype_selection.discard("")

        # Sorted so that the warnings come out in a deterministic order.
        for etype in sorted(etype_selection):
            if etype not in ee_models_paths:
                logger.warning(
                    f"Can't restrict to etype {etype} because it was not "
                    f"found in data_and_models folder. This entry will be ignored."
                )

        ee_models_paths = {
            etype: path
            for etype, path in ee_models_paths.items()
            if etype in etype_selection
        }

    # Create the cache creation class and run the cache creation
    logger.info("Creating the cache miner")
    cache_creator = CreateMiningCache(
        database_engine=database_engine,
        ee_models_paths=ee_models_paths,
        target_table_name=args.target_table_name,
        workers_per_model=args.n_processes_per_model,
        device=args.device,
    )

    logger.info("Launching the mining")
    cache_creator.construct()

    logger.info("All done, bye")


# Script entry point: run the cache creation and hand its return value
# to the interpreter as the process exit status.
if __name__ == "__main__":  # pragma: no cover
    exit_status = run_create_mining_cache()
    sys.exit(exit_status)