# Source code for bluesearch.database.mesh
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2022 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Utilities for handling MeSH topic data."""
from __future__ import annotations
import collections
import json
import logging
import pathlib
import re
from collections.abc import Iterable
from typing import Generator, TextIO
logger = logging.getLogger(__name__)
class MeSHTree:
    """The hierarchical tree of MeSH topics.

    The MeSH topic ontology forms a tree with most general topics at the
    root and the most specific topics as the leaves. Here's a part of a MeSH
    topic hierarchy

    .. code-block:: text

        Natural Science Disciplines [H01]
            Biological Science Disciplines [H01.158]
                Biology [H01.158.273]
                    Botany [H01.158.273.118]
                        Ethnobotany [H01.158.273.118.299]
                        Pharmacognosy [H01.158.273.118.598]
                            Herbal Medicine [H01.158.273.118.598.500]
                    Cell Biology [H01.158.273.160]
                    ...

    The full data can be found in the NLM's MeSH browser under
    https://meshb.nlm.nih.gov/.

    The topics are uniquely identified by their tree number (e.g. `H01.158`),
    while the same topic label can appear in different places.

    Parameters
    ----------
    tree_number_to_label
        The MeSH tree data. This dictionary should have tree numbers
        (e.g. `H01.158.273`) as keys, and topic labels (e.g. `Biology`)
        as values.

    Attributes
    ----------
    tree_number_to_label
        The mapping passed to the constructor (stored by reference).
    label_to_tree_numbers
        The inverse mapping built in the constructor: every topic label
        mapped to the list of tree numbers that carry this label.
    """

    def __init__(self, tree_number_to_label: dict[str, str]) -> None:
        self.tree_number_to_label = tree_number_to_label
        # A label can be attached to several tree numbers, hence the lists.
        self.label_to_tree_numbers: dict[str, list[str]] = collections.defaultdict(list)
        for tree_number, label in tree_number_to_label.items():
            self.label_to_tree_numbers[label].append(tree_number)

    @classmethod
    def load(cls, path: pathlib.Path | str) -> MeSHTree:
        """Initialise the MeSH tree from a JSON file.

        Parameters
        ----------
        path
            The path to the JSON file containing the MeSH tree data. See
            the `tree_number_to_label` parameter of the `MeSHTree`
            constructor for the data specification.

        Returns
        -------
        MeSHTree
            An initialised instance of the MeSHTree.
        """
        # Decode explicitly as UTF-8 so that the result does not depend on
        # the platform's default encoding.
        with open(path, encoding="utf-8") as fh:
            tree_number_to_label = json.load(fh)

        return cls(tree_number_to_label)

    @staticmethod
    def parents(tree_number: str) -> Generator[str, None, None]:
        """Generate all parent tree numbers.

        For example, given the tree number `H01.158.273` the parent tree
        numbers are `H01.158` and `H01`.

        Parameters
        ----------
        tree_number
            A MeSH tree number, e.g. `H01.158.273`.

        Yields
        ------
        str
            The tree numbers of all parents of the given tree number,
            starting with the closest parent and ending with the root.
        """
        parts = tree_number.split(".")
        # Drop one trailing component at a time: closest parent first.
        for n in reversed(range(1, len(parts))):
            yield ".".join(parts[:n])

    def parent_topics(self, topic: str) -> set[str]:
        """Find all parent topic labels of a given topic.

        Note that a topic label does not have to be unique and can be
        assigned to multiple tree numbers. This method resolves all
        parent topics from all tree numbers that have the given label.

        Parameters
        ----------
        topic
            A MeSH topic label.

        Returns
        -------
        set[str]
            All parent topic labels. The set is empty if the label is not
            present in the tree or only appears at root level.

        Notes
        -----
        The labels of the parents are looked up by subscripting
        `tree_number_to_label`, so a parent tree number that is missing
        from that mapping propagates the mapping's lookup error
        (`KeyError` for a plain dictionary).
        """
        labels: set[str] = set()
        # Use .get() rather than subscripting: label_to_tree_numbers is a
        # defaultdict and subscripting would insert an empty entry for every
        # unknown label that is queried.
        for tree_number in self.label_to_tree_numbers.get(topic, ()):
            for parent in self.parents(tree_number):
                labels.add(self.tree_number_to_label[parent])

        return labels
def resolve_parents(topics: Iterable[str], mesh_tree: MeSHTree) -> set[str]:
    """Enhance the topic list by parents of all given topics.

    Parameters
    ----------
    topics
        A collection of MeSH topics. Any iterable is accepted, including
        one-shot iterators and generators; it is consumed exactly once.
    mesh_tree
        An instance of `MeSHTree`.

    Returns
    -------
    set[str]
        A set with the input topics and all their parent topics.
    """
    # Materialise the input once. Iterating `topics` a second time would
    # yield nothing for an iterator/generator and silently skip all parents.
    unique_topics = set(topics)

    # Work on a copy so that the set being iterated over is never mutated.
    resolved = set(unique_topics)
    for topic in unique_topics:
        resolved |= mesh_tree.parent_topics(topic)

    return resolved
def parse_tree_numbers(nt_stream: TextIO) -> dict[str, str]:
    """Parse the MeSH topic tree from a stream of MeSH RDF N-tuples.

    Parameters
    ----------
    nt_stream
        A text stream of MeSH RDF N-tuples. This is intended to work with
        the content of the MeSH files downloaded from the following website:
        https://nlmpubs.nlm.nih.gov/projects/mesh/rdf

    Returns
    -------
    dict[str, str]
        A dictionary representing the parsed MeSH topic tree. The keys are
        the tree numbers that uniquely identify a topic. The values are the
        corresponding topic labels. Note that the topic labels are not
        unique. For example, the two tree numbers `F04.096.628.255.500` and
        `H01.158.610.030` have both the same label "Cognitive Neuroscience".

    Raises
    ------
    RuntimeError
        If a line is not a valid triple, if a descriptor has more than one
        English label, if the object of a tree number triple cannot be
        parsed, or if the same tree number is assigned more than once.
    """
    id_to_label: dict[str, str] = {}
    id_to_tree_numbers: dict[str, list[str]] = collections.defaultdict(list)

    # Regexes we need for parsing
    # Each line must be a triple subject, predicate, object. The subject and
    # the predicate are matched with "[^>]*" instead of a greedy ".*" so that
    # they can never run past their own closing bracket: with ".*" a literal
    # object containing "> <" makes the subject swallow part of the line and
    # the triple is then silently skipped.
    p_line = re.compile(r"(<[^>]*>) (<[^>]*>) (.*) \.")
    # We're only interested in subjects that represent descriptors. It appears
    # their ID is of the form "Dxxx..." where "xxx" are digits.
    p_desc = re.compile(r"<http://id\.nlm\.nih\.gov/mesh/\d{4}/(D\d{3,})>")
    # The topic label is in quotes and is followed by a language suffix. We'll
    # only keep labels that are in English
    p_en_label = re.compile(r"\"(.*)\"@en")
    # The "\d{4}" part is going to be the year, e.g. "2022", the actual tree
    # number is some combination of characters that we leave open.
    p_tree_number = re.compile(r"<http://id\.nlm\.nih\.gov/mesh/\d{4}/(.*)>")

    # The two predicates we'll be looking for
    pred_label = "<http://www.w3.org/2000/01/rdf-schema#label>"
    pred_tree_number = "<http://id.nlm.nih.gov/mesh/vocab#treeNumber>"

    for i, line in enumerate(nt_stream):
        if i % 1_000_000 == 0:
            # Progress report; the arguments are only merged into the
            # message if the record is actually emitted.
            logger.info("Parsed %s lines", f"{i:,d}")

        # Parse the triple
        m_line = p_line.fullmatch(line.strip())
        if not m_line:
            raise RuntimeError(f"The line is not a valid triple: {line!r}")
        subj, pred, obj = m_line.groups()

        # Extract the descriptor ID
        m_desc = p_desc.fullmatch(subj)
        if not m_desc:
            # Subject is not a descriptor
            continue
        id_ = m_desc.group(1)

        # Parse the descriptor label or tree number
        if pred == pred_label:
            m_label = p_en_label.fullmatch(obj)
            if not m_label:
                continue  # not an English label
            label = m_label.group(1)
            if id_ in id_to_label:
                raise RuntimeError(
                    f"Multiple labels for ID={id_}: {id_to_label[id_]}, {label}"
                )
            id_to_label[id_] = label
        elif pred == pred_tree_number:
            m_tree_number = p_tree_number.fullmatch(obj)
            if not m_tree_number:
                raise RuntimeError(f"Cannot parse tree number: {obj}")
            id_to_tree_numbers[id_].append(m_tree_number.group(1))

    # Given "id => label" and "id => tree numbers" find "tree number => label"
    logger.info("Labeling tree numbers")
    tree_number_to_label: dict[str, str] = {}
    for id_, label in id_to_label.items():
        # Descriptors without any tree number contribute nothing; .get()
        # avoids adding empty entries to the defaultdict for them.
        for tree_number in id_to_tree_numbers.get(id_, ()):
            if tree_number in tree_number_to_label:
                raise RuntimeError(f"Duplicate tree number: {tree_number}")
            tree_number_to_label[tree_number] = label

    return tree_number_to_label