# Source code for bluesearch.entrypoint.database.topic_extract

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Extract topic of articles."""
from __future__ import annotations

import argparse
import gzip
import logging
from pathlib import Path
from typing import Any

from bluesearch.database import mesh
from bluesearch.database.article import ArticleSource

logger = logging.getLogger(__name__)


def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Initialise the argument parser for the topic-extract subcommand.

    Parameters
    ----------
    parser
        The argument parser to initialise.

    Returns
    -------
    argparse.ArgumentParser
        The initialised argument parser. The same object as the `parser`
        argument.
    """
    parser.description = "Extract topic of articles."

    # Positional arguments.
    parser.add_argument(
        "source",
        # Only the values of the ArticleSource enum are accepted.
        choices=[member.value for member in ArticleSource],
        help="""
        Format of the input.
        If extracting topic of several articles, all articles must have
        the same format.
        """,
    )
    parser.add_argument(
        "input_path",
        type=Path,
        help="""
        Path to a file or directory. If a directory, topic will be extracted
        for all articles inside the directory.
        """,
    )
    parser.add_argument(
        "output_file",
        type=Path,
        help="""
        Path to the file where the topic information will be written.
        If it does not exist yet, the file is created.
        """,
    )

    # Optional arguments.
    parser.add_argument(
        "-m",
        "--match-filename",
        type=str,
        help="""
        Extract topic only of articles with a name matching the given regular
        expression. Ignored when 'input_path' is a path to a file.
        """,
    )
    parser.add_argument(
        "-R",
        "--recursive",
        action="store_true",
        help="""
        Find articles recursively.
        """,
    )
    parser.add_argument(
        "-o",
        "--overwrite",
        action="store_true",
        help="""
        If output_file exists and overwrite is true, the output file is
        overwritten. Otherwise, the topic extraction results are going
        to be appended to the `output_file`.
        """,
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        action="store_true",
        help="""
        Display files to parse without parsing them.
        Especially useful when using '--match-filename'
        and / or '--recursive'.
        """,
    )
    parser.add_argument(
        "--mesh-topic-db",
        type=Path,
        help="""
        The JSON file with MeSH topic hierarchy information. Mandatory for
        source types "pmc" and "pubmed".

        The JSON file should contain a flat dictionary with MeSH topic tree
        numbers mapped to the corresponding topic labels.
        This file can be produced using the `bbs_database parse-mesh-rdf`
        command. See that command's description for more details.
        """,
    )

    return parser


def run(
    *,
    source: str,
    input_path: Path,
    output_file: Path,
    match_filename: str | None,
    recursive: bool,
    overwrite: bool,
    dry_run: bool,
    mesh_topic_db: Path | None = None,
) -> int:
    """Extract topic of articles.

    Parameter description and potential defaults are documented inside of the
    `init_parser` function.

    Returns
    -------
    int
        The exit code. 0 on success (including a dry run). 1 if `find_files`
        rejects `input_path`, if `mesh_topic_db` is not given for the "pmc"
        or "pubmed" source types, or if the source type is not implemented.
    """
    # Imports are kept local to the function, as in the original code.
    from defusedxml import ElementTree

    from bluesearch.database.topic import (
        extract_article_topics_for_pubmed_article,
        extract_article_topics_from_medrxiv_article,
        extract_journal_topics_for_pubmed_article,
        get_topics_for_arxiv_articles,
        get_topics_for_pmc_article,
    )
    from bluesearch.database.topic_info import TopicInfo
    from bluesearch.utils import JSONL, find_files

    try:
        inputs = find_files(input_path, recursive, match_filename)
    except ValueError:
        logger.error(
            "Argument 'input_path' should be a path "
            "to an existing file or directory!"
        )
        return 1

    if dry_run:
        # Inputs are already sorted.
        print(*inputs, sep="\n")
        return 0

    article_source = ArticleSource(source)
    all_results: list[dict[str, Any]] = []

    if article_source is ArticleSource.PMC:
        if mesh_topic_db is None:
            # The message names the option exactly as registered in
            # `init_parser` (--mesh-topic-db).
            logger.error("The option --mesh-topic-db is mandatory for source type pmc")
            return 1
        mesh_tree = mesh.MeSHTree.load(mesh_topic_db)
        for path in inputs:
            logger.info(f"Processing {path}")
            topic_info = TopicInfo(source=article_source, path=path.resolve())
            journal_topics = get_topics_for_pmc_article(path)
            if journal_topics:
                topic_info.add_journal_topics(
                    "MeSH", mesh.resolve_parents(journal_topics, mesh_tree)
                )
            all_results.append(topic_info.json())
    elif article_source is ArticleSource.PUBMED:
        if mesh_topic_db is None:
            logger.error(
                "The option --mesh-topic-db is mandatory for source type pubmed"
            )
            return 1
        mesh_tree = mesh.MeSHTree.load(mesh_topic_db)
        for path in inputs:
            logger.info(f"Processing {path}")
            # Open the file of the current iteration, not `input_path`:
            # the latter may be a directory containing several files.
            with gzip.open(path) as xml_stream:
                articles = ElementTree.parse(xml_stream)

            # One file can hold several articles; `element_in_file` records
            # the position of each one within the file.
            for i, article in enumerate(articles.iter("PubmedArticle")):
                topic_info = TopicInfo(
                    source=article_source,
                    path=path.resolve(),
                    element_in_file=i,
                )
                article_topics = extract_article_topics_for_pubmed_article(article)
                journal_topics = extract_journal_topics_for_pubmed_article(article)
                if article_topics:
                    topic_info.add_article_topics(
                        "MeSH", mesh.resolve_parents(article_topics, mesh_tree)
                    )
                if journal_topics:
                    topic_info.add_journal_topics(
                        "MeSH", mesh.resolve_parents(journal_topics, mesh_tree)
                    )
                all_results.append(topic_info.json())
    elif article_source is ArticleSource.ARXIV:
        for path, article_topics in get_topics_for_arxiv_articles(inputs).items():
            topic_info = TopicInfo(source=article_source, path=path)
            topic_info.add_article_topics("arXiv", article_topics)
            all_results.append(topic_info.json())
    elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}:
        for path in inputs:
            logger.info(f"Processing {path}")
            topic, journal = extract_article_topics_from_medrxiv_article(path)
            # The source is taken from the journal reported by the extractor,
            # not from the `source` argument.
            topic_info = TopicInfo(source=ArticleSource(journal), path=path)
            topic_info.add_article_topics("Subject Area", [topic])
            all_results.append(topic_info.json())
    else:
        logger.error(f"The source type {source!r} is not implemented yet")
        return 1

    JSONL.dump_jsonl(all_results, output_file, overwrite)

    return 0