# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Extract topic of articles."""
from __future__ import annotations
import argparse
import gzip
import logging
from pathlib import Path
from typing import Any
from bluesearch.database import mesh
from bluesearch.database.article import ArticleSource
logger = logging.getLogger(__name__)
def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Initialise the argument parser for the topic-extract subcommand.

    Three positional arguments (``source``, ``input_path``, ``output_file``)
    and five options (``--match-filename``, ``--recursive``, ``--overwrite``,
    ``--dry-run``, ``--mesh-topic-db``) are registered. The resulting
    attribute names match the keyword arguments of `run`.

    Parameters
    ----------
    parser
        The argument parser to initialise.

    Returns
    -------
    argparse.ArgumentParser
        The initialised argument parser. The same object as the `parser`
        argument.
    """
    parser.description = "Extract topic of articles."

    # Positional arguments.
    parser.add_argument(
        "source",
        # Only the values of the ArticleSource enum are accepted.
        choices=[member.value for member in ArticleSource],
        help="""
        Format of the input.
        If extracting topic of several articles, all articles must have the same format.
        """,
    )
    parser.add_argument(
        "input_path",
        type=Path,
        help="""
        Path to a file or directory. If a directory, topic will be extracted for
        all articles inside the directory.
        """,
    )
    parser.add_argument(
        "output_file",
        type=Path,
        help="""
        Path to the file where the topic information will be written.
        If it does not exist yet, the file is created.
        """,
    )

    # Optional arguments.
    parser.add_argument(
        "-m",
        "--match-filename",
        type=str,
        help="""
        Extract topic only of articles with a name matching the given regular
        expression. Ignored when 'input_path' is a path to a file.
        """,
    )
    parser.add_argument(
        "-R",
        "--recursive",
        action="store_true",
        help="""
        Find articles recursively.
        """,
    )
    parser.add_argument(
        "-o",
        "--overwrite",
        action="store_true",
        help="""
        If output_file exists and overwrite is true, the output file is overwritten.
        Otherwise, the topic extraction results are going
        to be appended to the `output_file`.
        """,
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        action="store_true",
        help="""
        Display files to parse without parsing them.
        Especially useful when using '--match-filename' and / or '--recursive'.
        """,
    )
    parser.add_argument(
        "--mesh-topic-db",
        type=Path,
        help="""
        The JSON file with MeSH topic hierarchy information. Mandatory for
        source types "pmc" and "pubmed".
        The JSON file should contain a flat dictionary with MeSH topic tree
        numbers mapped to the corresponding topic labels. This file can be
        produced using the `bbs_database parse-mesh-rdf` command. See that
        command's description for more details.
        """,
    )
    return parser
def run(
    *,
    source: str,
    input_path: Path,
    output_file: Path,
    match_filename: str | None,
    recursive: bool,
    overwrite: bool,
    dry_run: bool,
    mesh_topic_db: Path | None = None,
) -> int:
    """Extract topic of articles.

    Parameter description and potential defaults are documented inside of the
    `init_parser` function.

    Returns
    -------
    int
        The exit code. It is 0 when the topics were written to `output_file`
        or when only a dry run was requested. It is 1 when `input_path`
        could not be resolved to input files, when `mesh_topic_db` is missing
        for a source type that needs it, or when the source type is not
        implemented.
    """
    # Heavy / optional dependencies are imported lazily, only when the
    # subcommand is actually executed.
    from defusedxml import ElementTree

    from bluesearch.database.topic import (
        extract_article_topics_for_pubmed_article,
        extract_article_topics_from_medrxiv_article,
        extract_journal_topics_for_pubmed_article,
        get_topics_for_arxiv_articles,
        get_topics_for_pmc_article,
    )
    from bluesearch.database.topic_info import TopicInfo
    from bluesearch.utils import JSONL, find_files

    try:
        inputs = find_files(input_path, recursive, match_filename)
    except ValueError:
        logger.error(
            "Argument 'input_path' should be a path "
            "to an existing file or directory!"
        )
        return 1

    if dry_run:
        # Inputs are already sorted.
        print(*inputs, sep="\n")
        return 0

    article_source = ArticleSource(source)
    all_results: list[dict[str, Any]] = []

    if article_source is ArticleSource.PMC:
        if mesh_topic_db is None:
            logger.error("The option --mesh-topic-db is mandatory for source type pmc")
            return 1
        mesh_tree = mesh.MeSHTree.load(mesh_topic_db)

        for path in inputs:
            logger.info(f"Processing {path}")
            topic_info = TopicInfo(source=article_source, path=path.resolve())
            journal_topics = get_topics_for_pmc_article(path)
            if journal_topics:
                topic_info.add_journal_topics(
                    "MeSH", mesh.resolve_parents(journal_topics, mesh_tree)
                )
            # Recorded even when no journal topics were found.
            all_results.append(topic_info.json())

    elif article_source is ArticleSource.PUBMED:
        if mesh_topic_db is None:
            logger.error(
                "The option --mesh-topic-db is mandatory for source type pubmed"
            )
            return 1
        mesh_tree = mesh.MeSHTree.load(mesh_topic_db)

        for path in inputs:
            logger.info(f"Processing {path}")
            # Open the file of the current iteration, not `input_path`, which
            # may be a directory or a different file altogether.
            with gzip.open(path) as xml_stream:
                articles = ElementTree.parse(xml_stream)

            # One gzipped XML file holds several articles; `element_in_file`
            # records the position of each article inside its file.
            for i, article in enumerate(articles.iter("PubmedArticle")):
                topic_info = TopicInfo(
                    source=article_source,
                    path=path.resolve(),
                    element_in_file=i,
                )
                article_topics = extract_article_topics_for_pubmed_article(article)
                journal_topics = extract_journal_topics_for_pubmed_article(article)
                if article_topics:
                    topic_info.add_article_topics(
                        "MeSH", mesh.resolve_parents(article_topics, mesh_tree)
                    )
                if journal_topics:
                    topic_info.add_journal_topics(
                        "MeSH", mesh.resolve_parents(journal_topics, mesh_tree)
                    )
                all_results.append(topic_info.json())

    elif article_source is ArticleSource.ARXIV:
        # All inputs are handed over at once; the helper returns a mapping
        # from article path to the topics of that article.
        for path, article_topics in get_topics_for_arxiv_articles(inputs).items():
            topic_info = TopicInfo(source=article_source, path=path)
            topic_info.add_article_topics("arXiv", article_topics)
            all_results.append(topic_info.json())

    elif article_source in {ArticleSource.BIORXIV, ArticleSource.MEDRXIV}:
        for path in inputs:
            logger.info(f"Processing {path}")
            topic, journal = extract_article_topics_from_medrxiv_article(path)
            # The source stored in the result is the journal reported by the
            # extractor, not the source type requested on the command line.
            topic_info = TopicInfo(source=ArticleSource(journal), path=path)
            topic_info.add_article_topics("Subject Area", [topic])
            all_results.append(topic_info.json())

    else:
        logger.error(f"The source type {source!r} is not implemented yet")
        return 1

    JSONL.dump_jsonl(all_results, output_file, overwrite)
    return 0