Source code for bluesearch.database.mesh

#  Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
#  Copyright (C) 2022 Blue Brain Project, EPFL.
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU Lesser General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public License
#  along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Utilities for handling MeSH topic data."""
from __future__ import annotations

import collections
import json
import logging
import pathlib
import re
from collections.abc import Iterable
from typing import Generator, TextIO

logger = logging.getLogger(__name__)


class MeSHTree:
    """The hierarchical tree of MeSH topics.

    The MeSH topic ontology forms a tree with most general topics at the
    root and the most specific topics as the leaves. Here's a part of a
    MeSH topic hierarchy

    .. code-block:: text

        Natural Science Disciplines [H01]
            Biological Science Disciplines [H01.158]
                Biology [H01.158.273]
                    Botany [H01.158.273.118]
                        Ethnobotany [H01.158.273.118.299]
                        Pharmacognosy [H01.158.273.118.598]
                            Herbal Medicine [H01.158.273.118.598.500]
                    Cell Biology [H01.158.273.160]
                    ...

    The full data can be found in the NLM's MeSH browser under
    https://meshb.nlm.nih.gov/.

    The topics are uniquely identified by their tree number (e.g. `H01.158`),
    while the same topic label can appear in different places.

    Parameters
    ----------
    tree_number_to_label
        The MeSH tree data. This dictionary should have tree numbers
        (e.g. `H01.158.273`) as keys, and topic labels (e.g. `Biology`)
        as values.
    """

    def __init__(self, tree_number_to_label: dict[str, str]) -> None:
        self.tree_number_to_label = tree_number_to_label
        # Reverse index: one label may be attached to several tree numbers.
        self.label_to_tree_numbers: dict[str, list[str]] = collections.defaultdict(
            list
        )
        for tree_number, label in tree_number_to_label.items():
            self.label_to_tree_numbers[label].append(tree_number)

    @classmethod
    def load(cls, path: pathlib.Path | str) -> MeSHTree:
        """Initialise the MeSH tree from a JSON file.

        Parameters
        ----------
        path
            The path to the JSON file containing the MeSH tree data. See
            the `tree_number_to_label` parameter of the `MeSHTree`
            constructor for the data specification.

        Returns
        -------
        MeSHTree
            An initialised instance of the MeSHTree.
        """
        # Read as UTF-8 explicitly so that the result does not depend on
        # the locale's default encoding.
        with open(path, encoding="utf-8") as fh:
            tree_number_to_label = json.load(fh)
        return cls(tree_number_to_label)

    @staticmethod
    def parents(tree_number: str) -> Generator[str, None, None]:
        """Generate all parent tree numbers.

        For example, given the tree number `H01.158.273` the parent tree
        numbers are `H01.158` and `H01`.

        Parameters
        ----------
        tree_number
            A MeSH tree number, e.g. `H01.158.273`.

        Yields
        ------
        str
            The tree numbers of all parents of the given tree number,
            starting with the closest parent and ending with the root.
        """
        parts = tree_number.split(".")
        # Drop one trailing component at a time; the full tree number itself
        # is not its own parent, hence the range stops before len(parts).
        for n in reversed(range(1, len(parts))):
            yield ".".join(parts[:n])

    def parent_topics(self, topic: str) -> set[str]:
        """Find all parent topic labels of a given topic.

        Note that a topic label does not have to be unique and can be
        assigned to multiple tree numbers. This method resolves all parent
        topics from all tree numbers that have the given label.

        Parameters
        ----------
        topic
            A MeSH topic label.

        Returns
        -------
        set[str]
            All parent topic labels. The set is empty if the topic label
            is unknown or only appears at the root of the tree.

        Raises
        ------
        KeyError
            If a parent tree number of the topic is missing from the
            tree data.
        """
        parent_topics: set[str] = set()
        # Use `.get` rather than indexing: indexing the defaultdict would
        # insert an empty entry for every unknown topic that is queried.
        for tree_number in self.label_to_tree_numbers.get(topic, ()):
            for parent in self.parents(tree_number):
                parent_topics.add(self.tree_number_to_label[parent])
        return parent_topics
def resolve_parents(topics: Iterable[str], mesh_tree: MeSHTree) -> set[str]:
    """Enhance the topic list by parents of all given topics.

    Parameters
    ----------
    topics
        A collection of MeSH topics. Any iterable is accepted, including
        one-shot iterators such as generators.
    mesh_tree
        An instance of `MeSHTree`.

    Returns
    -------
    set[str]
        A set with the input topics and all their parent topics.
    """
    # Consume `topics` exactly once. Iterating it a second time would yield
    # nothing for generators and other one-shot iterators, so their parents
    # would silently be left out.
    input_topics = set(topics)

    resolved = set(input_topics)
    for topic in input_topics:
        resolved |= mesh_tree.parent_topics(topic)

    return resolved
def parse_tree_numbers(nt_stream: TextIO) -> dict[str, str]:
    """Parse the MeSH topic tree from a stream of MeSH RDF N-tuples.

    Parameters
    ----------
    nt_stream
        A text stream of MeSH RDF N-tuples. This is intended to work with
        the content of the MeSH files downloaded from the following
        website: https://nlmpubs.nlm.nih.gov/projects/mesh/rdf

    Returns
    -------
    dict[str, str]
        A dictionary representing the parsed MeSH topic tree. The keys are
        the tree numbers that uniquely identify a topic. The values are the
        corresponding topic labels. Note that the topic labels are not
        unique. For example, the two tree numbers `F04.096.628.255.500`
        and `H01.158.610.030` have both the same label
        "Cognitive Neuroscience".

    Raises
    ------
    RuntimeError
        If a line is not a valid triple, if a descriptor has more than one
        English label, if a tree number cannot be parsed, or if the same
        tree number is assigned more than once.
    """
    id_to_label: dict[str, str] = {}
    id_to_tree_numbers: dict[str, list[str]] = collections.defaultdict(list)

    # Regexes we need for parsing

    # Each line must be a triple subject, predicate, object. The subject and
    # the predicate are IRIs, which cannot contain ">". Matching them with
    # "[^>]*" instead of a greedy ".*" keeps a literal object that happens
    # to contain "> <" from being mistaken for part of the subject.
    p_line = re.compile(r"(<[^>]*>) (<[^>]*>) (.*) \.")
    # We're only interested in subjects that represent descriptors. It appears
    # their ID is of the form "Dxxx..." where "xxx" are digits.
    p_desc = re.compile(r"<http://id\.nlm\.nih\.gov/mesh/\d{4}/(D\d{3,})>")
    # The topic label is in quotes and is followed by a language suffix. We'll
    # only keep labels that are in English
    p_en_label = re.compile(r"\"(.*)\"@en")
    # The "\d{4}" part is going to be the year, e.g. "2022", the actual tree
    # number is some combination of characters that we leave open.
    p_tree_number = re.compile(r"<http://id\.nlm\.nih\.gov/mesh/\d{4}/(.*)>")

    # The two predicates we'll be looking for
    pred_label = "<http://www.w3.org/2000/01/rdf-schema#label>"
    pred_tree_number = "<http://id.nlm.nih.gov/mesh/vocab#treeNumber>"

    for i, line in enumerate(nt_stream):
        if i % 1_000_000 == 0:
            logger.info(f"Parsed {i:,d} lines")

        # Parse the triple
        m_line = p_line.fullmatch(line.strip())
        if not m_line:
            raise RuntimeError(f"The line is not a valid triple: {line!r}")
        subj, pred, obj = m_line.groups()

        # Extract the descriptor ID
        m_desc = p_desc.fullmatch(subj)
        if not m_desc:
            # Subject is not a descriptor
            continue
        id_ = m_desc.group(1)

        # Parse the descriptor label or tree number
        if pred == pred_label:
            m_label = p_en_label.fullmatch(obj)
            if not m_label:
                continue  # not an English label
            label = m_label.group(1)
            if id_ in id_to_label:
                raise RuntimeError(
                    f"Multiple labels for ID={id_}: {id_to_label[id_]}, {label}"
                )
            id_to_label[id_] = label
        elif pred == pred_tree_number:
            m_tree_number = p_tree_number.fullmatch(obj)
            if not m_tree_number:
                raise RuntimeError(f"Cannot parse tree number: {obj}")
            id_to_tree_numbers[id_].append(m_tree_number.group(1))

    # Given "id => label" and "id => tree numbers" find "tree number => label"
    logger.info("Labeling tree numbers")
    tree_number_to_label: dict[str, str] = {}
    for id_, label in id_to_label.items():
        # Descriptors without any tree number contribute nothing; `.get`
        # avoids adding empty entries to the defaultdict for them.
        for tree_number in id_to_tree_numbers.get(id_, ()):
            if tree_number in tree_number_to_label:
                raise RuntimeError(f"Duplicate tree number: {tree_number}")
            tree_number_to_label[tree_number] = label

    return tree_number_to_label