Source code for bluesearch.entrypoint.mining_cache

"""EntryPoint for mining a database and saving of extracted items in a cache."""

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import argparse
import getpass
import logging
import pathlib
import sys

import sqlalchemy
from sqlalchemy.pool import NullPool

from bluesearch.entrypoint._helper import (
    CombinedHelpFormatter,
    configure_logging,
    parse_args_or_environment,
)
from bluesearch.utils import get_available_spacy_models


[docs]def run_create_mining_cache(argv=None): """Mine all texts in database and save results in a cache. Parameters ---------- argv : list_like of str The command line arguments. """ parser = argparse.ArgumentParser( description="Mine the CORD-19 database and cache the results.", formatter_class=CombinedHelpFormatter, ) parser.add_argument( "--data-and-models-dir", type=str, help=""" The local path to the "data_and_models" directory. It will be used to load the available spacy models from <data-and-models-dir>/models/ner_er/ If missing, then the environment variable BBS_DATA_AND_MODELS_DIR will be read. """, default=argparse.SUPPRESS, ) parser.add_argument( "--db-type", default="mysql", type=str, choices=("mysql", "sqlite"), help="Type of the database.", ) parser.add_argument( "--db-url", type=str, help=""" The location of the database depending on the database type. For MySQL the server URL should be provided, for SQLite the location of the database file. Generally, the scheme part of the URL should be omitted, e.g. for MySQL the URL should be of the form 'my_sql_server.ch:1234/my_database' and for SQLite of the form '/path/to/the/local/database.db'. If missing, then the environment variable DB_URL will be read. """, default=argparse.SUPPRESS, ) parser.add_argument( "--target-table-name", default="mining_cache_temporary", type=str, help="The name of the target mining cache table", ) parser.add_argument( "--n-processes-per-model", default=1, type=int, help=""" Each mining model is run in parallel with respect to the others. In addition to that, n-processes-per-model are used to run in parallel a single mining model. """, ) parser.add_argument( "--restrict-to-etypes", type=str, default=None, help=""" Comma-separated list of entity types to detect to populate the cache. By default, all models in data_and_models/models/ner_er/ are run. """, ) parser.add_argument( "--device", "-d", type=str, choices=["cpu", "cuda"], default="cpu", help="Device to use for the inference {'cpu', 'gpu'}.", ) parser.add_argument( "--log-file", "-l", type=str, metavar="<filepath>", default=None, help="In addition to stderr, log messages to a file.", ) parser.add_argument( "--log-level", type=int, default=20, help=""" The logging level. Possible values: - 50 for CRITICAL - 40 for ERROR - 30 for WARNING - 20 for INFO - 10 for DEBUG - 0 for NOTSET """, ) # Parse CLI arguments env_variable_names = { "db_url": "DB_URL", "data_and_models_dir": "BBS_DATA_AND_MODELS_DIR", } args = parse_args_or_environment(parser, env_variable_names, argv=argv) # Configure logging configure_logging(args.log_file, args.log_level) logger = logging.getLogger("Mining cache entrypoint") logger.info(" Configuration ".center(80, "-")) for k, v in vars(args).items(): logger.info(f"{k:<32}: {v}") logger.info("-" * 80) # Loading libraries logger.info("Loading libraries") from bluesearch.database.mining_cache import CreateMiningCache # Database type logger.info("Parsing the database type") if args.db_type == "sqlite": database_path = pathlib.Path(args.db_url) if not database_path.exists(): raise FileNotFoundError(f"No database found at {database_path}.") database_url = f"sqlite:///{database_path}" elif args.db_type == "mysql": password = getpass.getpass("MySQL root password: ") database_url = f"mysql+pymysql://root:{password}@{args.db_url}" else: # pragma: no cover # Will never get here because `parser.parse_args()` will fail first. # This is because we have choices=("mysql", "sqlite") in the # argparse parameters raise ValueError("Invalid database type specified under --db-type") # Create the database engine logger.info("Creating the database engine") # The NullPool prevents the Engine from using any connection more than once # This is important for multiprocessing database_engine = sqlalchemy.create_engine(database_url, poolclass=NullPool) # Load the models library logger.info("Loading the available spacy models") ee_models_paths = get_available_spacy_models(args.data_and_models_dir) # Restrict to given models if args.restrict_to_etypes is not None: logger.info("Restricting to a subset of entity types") etype_selection = args.restrict_to_etypes.split(",") etype_selection = set(map(lambda s: s.strip().upper(), etype_selection)) for etype in etype_selection: if etype not in ee_models_paths: logger.warning( f"Can't restrict to etype {etype} because it was not " f"found in data_and_models folder. This entry will be ignored." ) ee_models_paths = { etype: path for etype, path in ee_models_paths.items() if etype in etype_selection } # Create the cache creation class and run the cache creation logger.info("Creating the cache miner") cache_creator = CreateMiningCache( database_engine=database_engine, ee_models_paths=ee_models_paths, target_table_name=args.target_table_name, workers_per_model=args.n_processes_per_model, device=args.device, ) logger.info("Launching the mining") cache_creator.construct() logger.info("All done, bye")
if __name__ == "__main__": # pragma: no cover sys.exit(run_create_mining_cache())