# Source code for bluesearch.entrypoint.database.parse

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Parsing articles."""
from __future__ import annotations

import argparse
import gzip
import json
import logging
import sys
import warnings
from pathlib import Path
from typing import Iterator

from defusedxml import ElementTree

from bluesearch.database.article import (
    Article,
    ArticleParser,
    CORD19ArticleParser,
    JATSXMLParser,
    PubMedXMLParser,
    TEIXMLParser,
)

logger = logging.getLogger(__name__)


def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Initialise the argument parser for the parse subcommand.

    Parameters
    ----------
    parser
        The argument parser to initialise.

    Returns
    -------
    argparse.ArgumentParser
        The initialised argument parser. The same object as the `parser`
        argument.
    """
    parser.description = "Parse one or several articles."

    parser.add_argument(
        "input_type",
        type=str,
        choices=(
            "cord19-json",
            "jats-xml",
            "jats-meca",
            "pubmed-xml",
            "pubmed-xml-set",
            "tei-xml",
            "tei-xml-arxiv",
        ),
        help="""
        Format of the input.
        If parsing several articles, all articles must have the same format.
        'jats-xml' could be used for articles from PubMed Central.
        'jats-meca' could be used for articles from bioRxiv and medRxiv.
        'tei-xml-arxiv' should be used for TEI XML generated from arXiv PDF
        articles.
        """,
    )
    # `nargs="?"` makes this positional optional: when it is omitted, the
    # value is None and the paths are expected on the standard input instead.
    parser.add_argument(
        "input_path",
        type=Path,
        help="""
        Path to a file or directory. If a directory, all articles
        inside the directory will be parsed. If not provided the user is
        supposed to pipe a space separated list of article paths to the
        standard input.
        """,
        nargs="?",
    )
    parser.add_argument(
        "output_dir",
        type=Path,
        help="""
        Path to a directory where parsed article(s) will be saved. If it does
        not exist yet, a directory with this path is created.
        """,
    )
    parser.add_argument(
        "-m",
        "--match-filename",
        type=str,
        help="""
        Parse only files with a name matching the given regular expression.
        Ignored when 'input_path' is a path to a file.
        """,
    )
    parser.add_argument(
        "-R",
        "--recursive",
        action="store_true",
        help="""
        Parse files recursively.
        """,
    )
    parser.add_argument(
        "-n",
        "--dry-run",
        action="store_true",
        help="""
        Display files to parse without parsing them.
        Especially useful when using '--match-filename' and / or '--recursive'.
        """,
    )
    return parser
def iter_parsers(input_type: str, input_path: Path) -> Iterator[ArticleParser]:
    """Yield initialized parsers for the given input.

    Parameters
    ----------
    input_type
        Format of the input. Supported values are "cord19-json", "jats-xml",
        "jats-meca", "pubmed-xml", "pubmed-xml-set", "tei-xml" and
        "tei-xml-arxiv".
    input_path
        Path to the file to parse.

    Yields
    ------
    ArticleParser
        A parser for each article found in `input_path`. For
        "pubmed-xml-set" one parser is yielded per "PubmedArticle" element
        of the gzip-compressed XML file; every other format yields exactly
        one parser.

    Raises
    ------
    ValueError
        If `input_type` is not one of the supported values. As this is a
        generator, the error is raised on first iteration, not at call time.
    """
    if input_type == "cord19-json":
        # Read as UTF-8 explicitly rather than relying on the platform
        # default, and close the file before handing control to the caller.
        with input_path.open(encoding="utf-8") as f:
            data = json.load(f)
        yield CORD19ArticleParser(data)

    elif input_type == "jats-xml":
        yield JATSXMLParser.from_xml(input_path)

    elif input_type == "jats-meca":
        yield JATSXMLParser.from_zip(input_path)

    elif input_type == "pubmed-xml":
        yield PubMedXMLParser(input_path)

    elif input_type == "pubmed-xml-set":
        # The whole tree is parsed up front, so the stream can be closed
        # before the individual articles are yielded.
        with gzip.open(input_path) as xml_stream:
            articles = ElementTree.parse(xml_stream)
        for article in articles.iter("PubmedArticle"):
            yield PubMedXMLParser(article)

    elif input_type in ("tei-xml", "tei-xml-arxiv"):
        # Exact match instead of a prefix test: a misspelt "tei-xml-*" value
        # must be rejected, not silently parsed as a non-arXiv TEI file.
        yield TEIXMLParser(input_path, is_arxiv=input_type == "tei-xml-arxiv")

    else:
        raise ValueError(f"Unsupported input type '{input_type}'!")
def run(
    *,
    input_type: str,
    input_path: Path | None,
    output_dir: Path,
    match_filename: str | None,
    recursive: bool,
    dry_run: bool,
) -> int:
    """Parse one or several articles.

    Parameter description and potential defaults are documented inside of the
    `init_parser` function.

    Returns
    -------
    int
        Exit code. 1 if no input was provided or `input_path` is not an
        existing file or directory, 0 otherwise. A failure while parsing an
        individual file is reported as a `RuntimeWarning` and does not change
        the exit code.
    """
    from bluesearch.utils import find_files

    if input_path is None:
        if sys.stdin.isatty():
            # Real terminal session: nothing was piped in.
            logger.error("No input files provided")
            return 1

        # Piped session: whitespace-separated list of paths on stdin.
        inputs = []
        for token in sys.stdin.read().split():
            path = Path(token)
            if path.exists():
                inputs.append(path)
            else:
                # Still skipped, but no longer silently.
                logger.warning("Skipping non-existent input path: %s", path)
    else:
        try:
            inputs = find_files(input_path, recursive, match_filename)
        except ValueError:
            logger.error(
                "Argument 'input_path' should be a path "
                "to an existing file or directory!"
            )
            return 1

    if dry_run:
        # Per the original note, inputs from find_files are already sorted;
        # piped paths keep the order in which they were given.
        print(*inputs, sep="\n")
        return 0

    # `parents=True` so that a nested, not-yet-existing output path is
    # created as promised by the CLI help.
    output_dir.mkdir(parents=True, exist_ok=True)

    # A distinct loop name avoids shadowing the `input_path` parameter.
    for article_path in inputs:
        logger.info("Parsing %s", article_path.name)
        try:
            for parser in iter_parsers(input_type, article_path):
                article = Article.parse(parser)
                output_file = output_dir / f"{article.uid}.json"

                # Never overwrite an earlier result; the error is caught
                # below and ends the processing of this input file.
                if output_file.exists():
                    raise FileExistsError(f"Output '{output_file}' already exists!")

                output_file.write_text(article.to_json(), "utf-8")
        except Exception as e:
            # Deliberate best effort: report the failure and continue with
            # the next input file.
            warnings.warn(
                f'Failed parsing file "{article_path}":\n {e}',
                category=RuntimeWarning,
            )

    logger.info("Parsing done")
    return 0