Source code for bluesearch.entrypoint.database.parse_mesh_rdf

#  Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
#  Copyright (C) 2022 Blue Brain Project, EPFL.
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program. If not, see <https://www.gnu.org/licenses/>.
"""CLI sub-command for parsing MeSH RDF files."""
from __future__ import annotations

import argparse
import gzip
import json
import logging
import pathlib

logger = logging.getLogger(__name__)


[docs]def init_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParser: """Initialise the argument parser for the parse-mesh-rdf subcommand. Parameters ---------- parser The argument parser to initialise. Returns ------- argparse.ArgumentParser The initialised argument parser. The same object as the `parser` argument. """ parser.description = "Parse a MeSH RDF file in N-Triples format." parser.add_argument( "mesh_nt_gz_file", type=pathlib.Path, help=""" Path to a "mesh*.nt.gz" file downloaded from https://nlmpubs.nlm.nih.gov/projects/mesh/rdf/ """, ) parser.add_argument( "output_json_file", type=pathlib.Path, help=""" The output file for parsing results. The JSON file will contain a flat dictionary with MeSH tree names as keys and corresponding topic labels as values. """, ) return parser
[docs]def run(*, mesh_nt_gz_file: pathlib.Path, output_json_file: pathlib.Path) -> int: """Parse a MeSH RDF file to extract the topic tree structure. See the description of the `init_parser` command for more information on the command and its parameters. """ from bluesearch.database import mesh if not mesh_nt_gz_file.exists(): logger.error(f"The file {mesh_nt_gz_file} does not exist.") return 1 if not mesh_nt_gz_file.is_file(): logger.error(f"The path {mesh_nt_gz_file} must be a file.") return 1 logger.info(f"Parsing the MeSH file {mesh_nt_gz_file.resolve().as_uri()}") with gzip.open(mesh_nt_gz_file, "rt") as fh: tree_number_to_label = mesh.parse_tree_numbers(fh) logger.info(f"Saving results to {output_json_file.resolve().as_uri()}") with open(output_json_file, "w") as fh: json.dump(tree_number_to_label, fh) logger.info("Done") return 0