Source code for bluesearch.entrypoint.database.convert_pdf

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Implementation of the convert-pdf subcommand."""
from __future__ import annotations

import argparse
import logging
import pathlib
import textwrap
from concurrent.futures import ThreadPoolExecutor
from typing import Iterable

from bluesearch.database.pdf import grobid_is_alive, grobid_pdf_to_tei_xml

logger = logging.getLogger(__name__)


[docs]def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Initialise the argument parser for the convert-pdf subcommand.

    Parameters
    ----------
    parser
        The argument parser to initialise.

    Returns
    -------
    argparse.ArgumentParser
        The initialised argument parser. The same object as the `parser`
        argument.
    """
    parser.formatter_class = argparse.RawDescriptionHelpFormatter
    description = """
    Parse a PDF file using the GROBID service and produce a TEI XML
    file. It's assumed that the GROBID service is running under the
    host/port combination provided.

    For more information on how to host such a service refer to the official
    documentation: https://grobid.readthedocs.io/en/latest/Grobid-docker
    """
    parser.description = textwrap.dedent(description)

    parser.add_argument(
        "grobid_host",
        type=str,
        metavar="GROBID-HOST",
        help="The host of the GROBID server.",
    )
    parser.add_argument(
        "grobid_port",
        type=int,
        metavar="GROBID-PORT",
        help="The port of the GROBID server.",
    )
    parser.add_argument(
        "input_path",
        type=pathlib.Path,
        metavar="INPUT-PATH",
        help="""
        The path to a single PDF file or a directory with many PDF files. In
        the latter case all files with the extension ".pdf" will be globbed
        recursively in all subdirectories.
        """,
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        type=pathlib.Path,
        metavar="OUTPUT-DIR",
        help="""
        The output directory where the XML file(s) will be saved. If not
        provided the output files will be placed in the same directory as
        the input files.
        """,
    )
    parser.add_argument(
        "-w",
        "--num-workers",
        type=int,
        default=5,
        help="The number of workers",
    )
    parser.add_argument(
        "--force",
        "-f",
        action="store_true",
        help="""
        Overwrite the output files if they already exits. Without this flag
        all PDF files for which the output XML file already exists will
        be skipped
        """,
    )

    return parser


[docs]def run(
    grobid_host: str,
    grobid_port: int,
    input_path: pathlib.Path,
    output_dir: pathlib.Path | None,
    num_workers,
    *,
    force: bool,
) -> int:
    """Run the convert-pdf subcommand.

    Note that the names and types of the parameters should match the parser
    arguments added in ``init_parser``. The purpose of the matching is to be
    able to combine the functions in this way:

    >>> import argparse
    >>> from bluesearch.entrypoint.database import convert_pdf
    >>> parser = convert_pdf.init_parser(argparse.ArgumentParser())
    >>> # replace with true values and uncomment
    >>> argv = ["host", "port", "pdf_path", "xml_path"]
    >>> # args = parser.parse_args(argv)
    >>> # convert_pdf.run(**vars(args))

    This will run the convert-pdf subcommand implemented here as a standalone
    application.

    Parameters
    ----------
    grobid_host
        The host of the GROBID service.
    grobid_port
        The port of the GROBID service.
    input_path
        The path to the input PDF file or a directory with PDF files.
    output_dir
        The output directory for the XML files.
    num_workers
        The number of parallel workers.
    force
        If true overwrite the output file if it already exists.

    Returns
    -------
    int
        The exit code of the command
    """
    # Check the GROBID server
    if not grobid_is_alive(grobid_host, grobid_port):
        logger.error("The GROBID server is not alive")
        return 1

    # Check if the input file exists
    if not input_path.exists():
        logger.error(f"The input path {str(input_path)!r} does not exist")
        return 1

    # Collect input paths
    input_paths: Iterable[pathlib.Path]
    if input_path.is_file():
        input_paths = [input_path]
        input_dir = input_path.parent
    else:
        input_paths = input_path.rglob("*.pdf")
        input_dir = input_path

    # Set default output_dir as the same directory of input files
    output_dir = output_dir or input_dir

    path_map = _prepare_output_paths(input_paths, output_dir, force)

    if len(path_map) == 0:
        logger.warning("No files to process, stopping")
        return 0

    output_dir.mkdir(exist_ok=True)

    # Convert
    def do_work(
        path_map_item: tuple[pathlib.Path, pathlib.Path]
    ) -> pathlib.Path | None:
        """Try to run conversion, or return name of input pdf if that fails.

        Parameters
        ----------
        path_map_item
            Key-value pair of `input_pdf` and `output_xml` paths.

        Returns
        -------
        pathlib.Path | None
            Return `input_pdf` if conversion failed, otherwise None.
        """
        pdf_path, xml_path = path_map_item
        try:
            _convert_pdf_file(grobid_host, grobid_port, pdf_path, xml_path)
        except Exception as exc:
            logger.exception(
                f"An error happened when processing {pdf_path.resolve().as_uri()}: "
                f"{exc}"
            )
            return pdf_path
        return None

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        failed_paths = executor.map(do_work, path_map.items(), timeout=60)

    for path in failed_paths:
        if path is not None:
            logger.warning(f"Failed to process {path.resolve().as_uri()}")

    return 0


def _prepare_output_paths(
    input_paths: Iterable[pathlib.Path],
    output_dir: pathlib.Path,
    force: bool,
) -> dict[pathlib.Path, pathlib.Path]:
    """Assign output XML paths to all input PDF paths.

    Parameters
    ----------
    input_paths
        A sequence of input PDF paths.
    output_dir
        The output directory.
    force
        If False then the PDFs for which the outputs already exist will be
        skipped.

    Returns
    -------
    dict
        A mapping from input paths to output paths.
    """
    path_map = {}

    for input_path in input_paths:
        output_name = input_path.with_suffix(".xml").name
        output_path = output_dir / output_name

        if output_path.exists() and not force:
            logger.warning(
                "Not overwriting existing file %s, use --force to always overwrite.",
                output_path.resolve().as_uri(),
            )
        else:
            path_map[input_path] = output_path

    return path_map


def _convert_pdf_file(
    grobid_host: str,
    grobid_port: int,
    input_path: pathlib.Path,
    output_path: pathlib.Path,
) -> None:
    """Convert a single PDF file to XML and write the XML file to disk.

    Parameters
    ----------
    grobid_host
        The host of the GROBID service.
    grobid_port
        The port of the GROBID service.
    input_path
        The path to the input PDF file.
    output_path
        The output directory for the XML file.
    """
    logger.info(f"Reading {input_path.resolve().as_uri()}")
    with input_path.open("rb") as fh_pdf:
        pdf_content = fh_pdf.read()

    logger.info(f"Converting {input_path.resolve().as_uri()} to XML")
    xml_content = grobid_pdf_to_tei_xml(pdf_content, grobid_host, grobid_port)

    with output_path.open("w") as fh_xml:
        n_bytes = fh_xml.write(xml_content)
    logger.info(f"Wrote {output_path.resolve().as_uri()} ({n_bytes:,d} bytes)")