# Source code for bluesearch.entrypoint.database.parse

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Parsing articles."""
from __future__ import annotations

import argparse
import gzip
import json
import logging
import sys
import warnings
from pathlib import Path
from typing import Iterator

from defusedxml import ElementTree

from bluesearch.database.article import (
    Article,
    ArticleParser,
    CORD19ArticleParser,
    JATSXMLParser,
    PubMedXMLParser,
    TEIXMLParser,
)

logger = logging.getLogger(__name__)


def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Initialise the argument parser for the parse subcommand.

    Sets the parser description and registers three positional arguments
    (``input_type``, the optional ``input_path`` and ``output_dir``) plus
    three options (``--match-filename``, ``--recursive`` and ``--dry-run``).

    Parameters
    ----------
    parser
        The argument parser to initialise.

    Returns
    -------
    argparse.ArgumentParser
        The initialised argument parser. The same object as the `parser`
        argument.
    """
    parser.description = "Parse one or several articles."

    # Positional arguments.
    parser.add_argument(
        "input_type",
        type=str,
        choices=(
            "cord19-json",
            "jats-xml",
            "jats-meca",
            "pubmed-xml",
            "pubmed-xml-set",
            "tei-xml",
            "tei-xml-arxiv",
        ),
        help="""
        Format of the input.
        If parsing several articles, all articles must have the same format.
        'jats-xml' could be used for articles from PubMed Central.
        'jats-meca' could be used for articles from bioRxiv and medRxiv.
        'tei-xml-arxiv' should be used for TEI XML generated from arXiv PDF articles.
        """,
    )
    # nargs="?" makes this positional optional: when it is omitted the
    # parsed value is None.
    parser.add_argument(
        "input_path",
        type=Path,
        help="""
        Path to a file or directory. If a directory, all articles
        inside the directory will be parsed. If not provided the
        user is supposed to pipe a space separated list of article paths to
        the standard input.
        """,
        nargs="?",
    )
    parser.add_argument(
        "output_dir",
        type=Path,
        help="""
        Path to a directory where parsed article(s) will be saved.
        If it does not exist yet, a directory with this path is created.
        """,
    )

    # Optional flags.
    parser.add_argument(
        "-m",
        "--match-filename",
        type=str,
        help="""
        Parse only files with a name matching the given regular expression.
        Ignored when 'input_path' is a path to a file.
        """,
    )
    parser.add_argument(
        "-R",
        "--recursive",
        action="store_true",
        help="""
        Parse files recursively.
        """,
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        action="store_true",
        help="""
        Display files to parse without parsing them.
        Especially useful when using '--match-filename' and / or '--recursive'.
        """,
    )
    return parser


def iter_parsers(input_type: str, input_path: Path) -> Iterator[ArticleParser]:
    """Return an iterator of initialized parsers for the given input.

    Parameters
    ----------
    input_type
        Format of the input. One of "cord19-json", "jats-xml", "jats-meca",
        "pubmed-xml", "pubmed-xml-set", "tei-xml" or "tei-xml-arxiv".
    input_path
        Path to the file to parse.

    Yields
    ------
    ArticleParser
        A parser for each article found in the input. Every format yields
        exactly one parser, except "pubmed-xml-set" which yields one parser
        per ``PubmedArticle`` element of the gzip-compressed XML file.

    Raises
    ------
    ValueError
        If `input_type` is not one of the supported formats. Because this
        is a generator, the error surfaces when the iterator is first
        advanced, not when the function is called.
    """
    if input_type == "cord19-json":
        # Read as UTF-8 explicitly rather than with the locale's default
        # encoding, which varies across platforms.
        with input_path.open(encoding="utf-8") as f:
            data = json.load(f)
        # Yield only after the file is closed so the handle is not held
        # open while the consumer processes the parser.
        yield CORD19ArticleParser(data)

    elif input_type == "jats-xml":
        yield JATSXMLParser.from_xml(input_path)

    elif input_type == "jats-meca":
        yield JATSXMLParser.from_zip(input_path)

    elif input_type == "pubmed-xml":
        yield PubMedXMLParser(input_path)

    elif input_type == "pubmed-xml-set":
        # The whole tree is parsed while the gzip stream is open; the
        # elements are iterated afterwards from memory.
        with gzip.open(input_path) as xml_stream:
            articles = ElementTree.parse(xml_stream)
        for article in articles.iter("PubmedArticle"):
            yield PubMedXMLParser(article)

    elif input_type in ("tei-xml", "tei-xml-arxiv"):
        # Exact membership test: a prefix check would silently treat any
        # unknown "tei-xml*" value as plain TEI XML.
        yield TEIXMLParser(input_path, is_arxiv=(input_type == "tei-xml-arxiv"))

    else:
        raise ValueError(f"Unsupported input type '{input_type}'!")


def run(
    *,
    input_type: str,
    input_path: Path | None,
    output_dir: Path,
    match_filename: str | None,
    recursive: bool,
    dry_run: bool,
) -> int:
    """Parse one or several articles.

    Parameter description and potential defaults are documented inside of the
    `init_parser` function.

    When `input_path` is None, whitespace-separated paths are read from the
    standard input; paths that do not exist are skipped with a warning.
    Each parsed article is written to ``<output_dir>/<article.uid>.json``.
    A failure while processing one input file is reported as a
    `RuntimeWarning` and does not stop the remaining files.

    Returns
    -------
    int
        Exit code. 1 if `input_path` is None and the standard input is a
        terminal, or if looking up files for `input_path` raises
        `ValueError`; 0 otherwise (including dry runs and runs where
        individual files failed to parse).
    """
    if input_path is None:
        if sys.stdin.isatty():
            # Real terminal session: nothing was piped in.
            logger.error("No input files provided")
            return 1

        # Piped session: str.split() with no argument splits on any
        # whitespace, so both space- and newline-separated lists work.
        inputs = []
        for token in sys.stdin.read().split():
            candidate = Path(token)
            if candidate.exists():
                inputs.append(candidate)
            else:
                # Report dropped entries instead of ignoring them silently.
                logger.warning("Skipping non-existent input path: %s", candidate)

    else:
        # Project import kept local to the function and to the only branch
        # that needs it.
        from bluesearch.utils import find_files

        try:
            inputs = find_files(input_path, recursive, match_filename)
        except ValueError:
            logger.error(
                "Argument 'input_path' should be a path "
                "to an existing file or directory!"
            )
            return 1

    if dry_run:
        # Paths are printed in the order they were collected. Piped paths
        # keep the order they were given in; the ordering of find_files is
        # not visible from here (the previous comment stated it is sorted).
        print(*inputs, sep="\n")
        return 0

    # parents=True so that a nested output path can be created in one go.
    output_dir.mkdir(parents=True, exist_ok=True)

    for article_path in inputs:
        logger.info("Parsing %s", article_path.name)

        try:
            for parser in iter_parsers(input_type, article_path):
                article = Article.parse(parser)
                output_file = output_dir / f"{article.uid}.json"

                # Never overwrite an existing output. The error is caught
                # below, so it also ends processing of the current input
                # file (relevant for multi-article inputs).
                if output_file.exists():
                    raise FileExistsError(f"Output '{output_file}' already exists!")

                output_file.write_text(article.to_json(), "utf-8")

        except Exception as e:
            # Deliberately broad: one broken file must not abort the batch.
            warnings.warn(
                f'Failed parsing file "{article_path}":\n {e}',
                category=RuntimeWarning,
            )

    logger.info("Parsing done")

    return 0