Source code for bluesearch.database.topic

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Utils for journal/articles topics."""
from __future__ import annotations

import html
import logging
import pathlib
import re
import zipfile
from functools import lru_cache
from typing import Iterable
from xml.etree.ElementTree import Element  # nosec

import requests
from defusedxml import ElementTree

from bluesearch.database.article import JATSXMLParser, get_arxiv_id

logger = logging.getLogger(__name__)


# Journal Topic
@lru_cache(maxsize=None)
def request_mesh_from_nlm_ta(nlm_ta: str) -> list[dict] | None:
    """Fetch the Medical Subject Headings of a journal from the NLM Catalog.

    The journal is looked up by its NLM Title Abbreviation. Results are
    memoized per abbreviation through ``lru_cache``.

    Parameters
    ----------
    nlm_ta
        NLM Title Abbreviation of the journal.

    Returns
    -------
    meshs : list[dict] | None
        One dictionary per MeSH heading of the journal. None is returned
        (after logging an error) if the abbreviation contains an ampersand,
        if the response is not wrapped in the expected header/footer, or if
        the response body is empty.

    References
    ----------
    https://www.ncbi.nlm.nih.gov/books/NBK3799/#catalog.Title_Abbreviation_ta
    """
    if "&" in nlm_ta:
        logger.error(
            "Ampersands not allowed in the NLM title abbreviation. "
            f"Try unescaping HTML characters first. Got:\n{nlm_ta}"
        )
        return None

    # "format=text" only makes a difference when nothing was found: with it
    # the body between the <pre> tags is empty (checked below), without it
    # the reply is an HTML page that cannot be parsed.
    query = {"term": f'"{nlm_ta}"[ta]', "report": "xml", "format": "text"}
    reply = requests.get("https://www.ncbi.nlm.nih.gov/nlmcatalog", params=query)
    reply.raise_for_status()

    # NCBI escapes every XML tag of the record and wraps the result in a
    # <pre>...</pre> pair behind a fixed XHTML prologue. Verify that exact
    # envelope, drop the prologue and undo the escaping.
    opening_tag = "<pre>"
    closing_tag = "</pre>"
    envelope_start = (
        '<?xml version="1.0" encoding="utf-8"?>\n'
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
        + opening_tag
    )
    body = reply.text.strip()
    if not (body.startswith(envelope_start) and body.endswith(closing_tag)):
        logger.error(f"Unexpected response for parameters \n{query}")
        return None

    # Keep the <pre> pair: it becomes the root element of the parsed XML.
    xml_start = len(envelope_start) - len(opening_tag)
    xml_text = html.unescape(body[xml_start:]).strip()

    # An empty <pre> pair means the abbreviation was not found.
    if xml_text == opening_tag + closing_tag:
        logger.error(f"Empty body for parameters \n{query}")
        return None

    root = ElementTree.fromstring(xml_text)
    heading_elements = root.findall(
        "./NCBICatalogRecord/NLMCatalogRecord/MeshHeadingList/MeshHeading"
    )
    return _parse_mesh_from_nlm_catalog(heading_elements)
# Article Topic
def request_mesh_from_pubmed_id(pubmed_ids: Iterable[str]) -> dict:
    """Retrieve Medical Subject Headings from Pubmed ID.

    Parameters
    ----------
    pubmed_ids
        Iterable of Pubmed IDs. It is consumed exactly once.

    Returns
    -------
    pubmed_to_meshs : dict
        Dictionary containing Pubmed IDs as keys with corresponding
        Medical Subject Headings list as values. Articles of the response
        without a ``pubmed`` article ID are skipped. An empty dictionary is
        returned, without sending any request, if no ID was given.

    Raises
    ------
    requests.HTTPError
        If the efetch service answers with an HTTP error status.

    References
    ----------
    https://dataguide.nlm.nih.gov/eutilities/utilities.html#efetch
    """
    pubmed_str = ",".join(pubmed_ids)
    if not pubmed_str:
        # Nothing to look up: avoid sending a request with an empty "id"
        # parameter.
        return {}

    url = (
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
        f"db=pubmed&id={pubmed_str}&retmode=xml"
    )
    response = requests.get(url)
    # raise_for_status() is a no-op for successful responses, so no extra
    # "response.ok" guard is needed.
    response.raise_for_status()

    content = ElementTree.fromstring(response.content.decode())

    pubmed_to_meshs = {}
    for article in content.findall("./PubmedArticle"):
        pubmed_id_tag = article.find(
            "./PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']"
        )
        if pubmed_id_tag is None:
            continue

        mesh_headings = article.findall("./MedlineCitation/MeshHeadingList")
        pubmed_to_meshs[pubmed_id_tag.text] = _parse_mesh_from_pubmed(mesh_headings)

    return pubmed_to_meshs
# Utils
def extract_pubmed_id_from_pmc_file(path: str | pathlib.Path) -> str | None:
    """Read the Pubmed ID stored in a PMC XML file.

    Parameters
    ----------
    path
        Path to PMC XML.

    Returns
    -------
    pubmed_id : str | None
        Text of the ``article-id`` element whose ``pub-id-type`` is
        ``pmid``, or None if the file has no such element.
    """
    tree = ElementTree.parse(path)
    pmid_node = tree.find("./front/article-meta/article-id[@pub-id-type='pmid']")
    return None if pmid_node is None else pmid_node.text
def _parse_mesh_from_nlm_catalog(mesh_headings: Iterable[Element]) -> list[dict]:
    """Convert NLM Catalog ``MeshHeading`` elements into dictionaries.

    Parameters
    ----------
    mesh_headings
        ``MeshHeading`` XML elements of an NLM Catalog record.

    Returns
    -------
    mesh : list of dict
        One dictionary per heading with the keys ``"descriptor"`` and
        ``"qualifiers"``. Descriptor entries hold ``name``, ``major_topic``
        and ``ID`` (last path component of the heading's ``URI`` attribute,
        None if absent); qualifier entries hold ``name`` and ``major_topic``.
    """
    parsed = []
    for heading in mesh_headings:
        uri = heading.attrib.get("URI")
        # Only the trailing path component of the URI is kept as identifier.
        heading_id = None if uri is None else uri.rpartition("/")[2]

        descriptors = []
        qualifiers = []
        for child in heading:
            raw_name = child.text
            entry = {
                "name": raw_name if raw_name is None else html.unescape(raw_name),
                "major_topic": child.get("MajorTopicYN") == "Y",
            }
            if child.tag == "DescriptorName":
                entry["ID"] = heading_id
                descriptors.append(entry)
            else:
                qualifiers.append(entry)

        parsed.append({"descriptor": descriptors, "qualifiers": qualifiers})

    return parsed


def _parse_mesh_from_pubmed(mesh_headings: Iterable[Element]) -> list[dict]:
    """Convert efetch ``MeshHeadingList`` elements into dictionaries.

    Parameters
    ----------
    mesh_headings
        ``MeshHeadingList`` XML elements; every child of such an element is
        treated as one heading.

    Returns
    -------
    mesh : list of dict
        One dictionary per heading with the keys ``"descriptor"`` and
        ``"qualifiers"``. Each entry holds ``ID`` (from the ``UI`` attribute,
        None if absent), ``major_topic`` (None if the ``MajorTopicYN``
        attribute is absent) and ``name``.
    """
    parsed = []
    for heading_list in mesh_headings:
        for heading in heading_list:
            record = {"descriptor": [], "qualifiers": []}
            for child in heading:
                attrs = child.attrib

                ui = attrs.get("UI")
                if ui is not None:
                    ui = ui.rpartition("/")[2]

                if "MajorTopicYN" in attrs:
                    is_major = attrs["MajorTopicYN"] == "Y"
                else:
                    is_major = None

                entry = {"ID": ui, "major_topic": is_major, "name": child.text}
                # Anything that is not a DescriptorName counts as a qualifier.
                if child.tag == "DescriptorName":
                    record["descriptor"].append(entry)
                else:
                    record["qualifiers"].append(entry)

            parsed.append(record)

    return parsed
def get_topics_for_pmc_article(
    pmc_path: pathlib.Path | str,
) -> list[str] | None:
    """Extract journal topics of a PMC article.

    Parameters
    ----------
    pmc_path
        Path to the PMC article to consider. The file extension decides how
        the file is opened: ``.xml`` is parsed directly, ``.meca`` and
        ``.zip`` are opened as archives.

    Returns
    -------
    journal_topics : list[str] | None
        Journal topics for the given article. None if the article carries
        no (or an empty) NLM Title Abbreviation, or if the MeSH lookup for
        the journal did not succeed.

    Raises
    ------
    ValueError
        If the file extension is not one of ``xml``, ``meca`` or ``zip``.
    """
    # Determine journal title
    *_, extension = str(pmc_path).lower().rpartition(".")
    if extension == "xml":
        parser = JATSXMLParser.from_xml(pmc_path)
    elif extension in {"meca", "zip"}:
        parser = JATSXMLParser.from_zip(pmc_path)
    else:
        raise ValueError(
            f"Unknown file extension for JATS XML: {extension!r}. Only XML, MECA "
            f"and ZIP are supported. File: {str(pmc_path)!r}"
        )

    nlm_ta_tag = parser.content.find(
        "./front/journal-meta/journal-id[@journal-id-type='nlm-ta']"
    )
    # An element without text would otherwise pass None to the MeSH lookup,
    # which expects a string.
    if nlm_ta_tag is None or nlm_ta_tag.text is None:
        logger.error(f"No NLM Title Abbreviation found for {pmc_path}")
        return None

    nlm_ta = nlm_ta_tag.text
    logger.info(f"Journal Title Abbreviation: {nlm_ta}")

    journal_meshes = request_mesh_from_nlm_ta(nlm_ta)
    if journal_meshes is None:
        return None

    return [
        descriptor["name"]
        for mesh in journal_meshes
        for descriptor in mesh["descriptor"]
    ]
def get_topics_for_arxiv_articles(
    arxiv_paths: Iterable[pathlib.Path | str], batch_size: int = 400
) -> dict[pathlib.Path, list[str]]:
    """Extract journal topics of one or more arXiv article.

    Parameters
    ----------
    arxiv_paths
        Full paths to the arXiv articles to consider. Paths from which no
        arXiv ID can be extracted are logged and skipped.
    batch_size
        Metadata are retrieved using the arXiv API [1] in batches of size
        `batch_size`. Large batches values may create long request URLs that
        cause the arXiv API to fail. Must be at least 1.

    Returns
    -------
    article_topics : dict[pathlib.Path , list[str]]
        Maps each of the paths to a list of corresponding arXiv article
        topics. See [2] for an explanation of arXiv topics taxonomy.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1, if the arXiv API does not return
        the correct number of metadata, or if the ID of a returned entry
        is missing or cannot be parsed.

    References
    ----------
    [1] https://arxiv.org/help/api/user-manual
    [2] https://arxiv.org/category_taxonomy
    """
    # A negative step would make the batching loop below empty and silently
    # drop every article, so reject it explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Get arXiv IDs of interest.
    id_2_path: dict[str, pathlib.Path] = {}
    for p in arxiv_paths:
        try:
            arxiv_id = get_arxiv_id(p, with_prefix=False)
        except ValueError as ve:
            logger.error(f"Failed ID extraction: {ve}")
        else:
            id_2_path[arxiv_id] = pathlib.Path(p)

    # Retrieve metadata in batches
    ns = {
        "atom": "http://www.w3.org/2005/Atom",
        "arxiv": "http://arxiv.org/schemas/atom",
        "opensearch": "http://a9.com/-/spec/opensearch/1.1/",
    }
    base_url = "http://export.arxiv.org/api/query"
    id_pattern = re.compile(r"http://arxiv.org/abs/(.*)")
    ids = list(id_2_path)
    article_topics = {}
    for i_start in range(0, len(ids), batch_size):
        # Get a slice of arXiv ids
        id_list = ids[i_start : i_start + batch_size]

        # Send request to arXiv API for our slice of arXiv ids
        params = {
            "id_list": ",".join(id_list),
            "max_results": str(batch_size),
        }
        res = requests.get(base_url, params)
        res.raise_for_status()

        # Process response to retrieve arXiv ids and corresponding topics
        etree = ElementTree.fromstring(res.text)
        entries = etree.findall("./atom:entry", ns)
        if len(entries) != len(id_list):
            raise ValueError(
                f"Expected to find {len(id_list)} metadata, "
                f"but found {len(entries)}, for id_list = {id_list}"
            )
        for el in entries:
            # Both a missing <id> element and an empty one are reported as
            # ValueError, together with the text that failed to parse.
            atom_id = el.find("./atom:id", ns)
            id_text = None if atom_id is None else atom_id.text
            match = None if id_text is None else id_pattern.fullmatch(id_text)
            if match is None:
                raise ValueError(f"Could not extract ID from {id_text!r}")
            id_ = match.group(1)

            categories = [
                categ.get("term") for categ in el.findall("atom:category", ns)
            ]
            article_topics[id_2_path[id_]] = categories

    return article_topics
def extract_article_topics_from_medrxiv_article(
    path: pathlib.Path | str,
) -> tuple[str, str]:
    """Read topic and journal of a medRxiv/bioRxiv article from its archive.

    A `.meca` file is a zip archive with a fixed layout: it holds a folder
    `content` which should contain a single `.xml` file with the text and
    the metadata of the article.

    Parameters
    ----------
    path
        Path to a `.meca` file (a zip archive) with the layout described
        above.

    Returns
    -------
    topic : str
        The subject area of the article.
    journal : str
        The journal the article was published in. Should be either
        "medRxiv" or "bioRxiv".

    Raises
    ------
    ValueError
        Appropriate XML not found or the journal or topic are missing.
    """
    archive_path = pathlib.Path(path)
    with zipfile.ZipFile(archive_path) as archive:
        candidates = [
            member
            for member in archive.namelist()
            if member.startswith("content/") and member.endswith(".xml")
        ]
        if len(candidates) != 1:
            raise ValueError(
                "There needs to be exactly one .xml file inside of content/"
            )
        (xml_name,) = candidates

        # Parsing logic
        with archive.open(xml_name, "r") as xml_stream:
            tree = ElementTree.parse(xml_stream)

    journal_node = tree.find(
        "./front/journal-meta/journal-title-group/journal-title"
    )
    topic_node = tree.find(
        "./front/article-meta/article-categories"
        "/subj-group[@subj-group-type='hwp-journal-coll']/subject"
    )

    # The topic is checked first, so its error wins if both are absent.
    if topic_node is None:
        raise ValueError("No topic found")
    if journal_node is None:
        raise ValueError("No journal found")

    return topic_node.text, journal_node.text
def extract_article_topics_for_pubmed_article(
    xml_article: Element,
) -> list[str] | None:
    """Collect the MeSH descriptor names attached to a PubMed article.

    Parameters
    ----------
    xml_article
        XML parse of the article; its ``MedlineCitation/MeshHeadingList``
        elements are read.

    Returns
    -------
    article_topics : list[str] | None
        Names of all descriptors found in the article's MeSH headings, in
        document order. Empty if the article has no MeSH headings.
    """
    heading_lists = xml_article.findall("./MedlineCitation/MeshHeadingList")

    topics = []
    for mesh in _parse_mesh_from_pubmed(heading_lists):
        topics.extend(desc["name"] for desc in mesh["descriptor"])
    return topics
def extract_journal_topics_for_pubmed_article(
    xml_article: Element,
) -> list[str] | None:
    """Look up the MeSH descriptor names of the journal of a PubMed article.

    Parameters
    ----------
    xml_article
        XML parse of the article; the journal abbreviation is read from its
        ``MedlineCitation/MedlineJournalInfo/MedlineTA`` element.

    Returns
    -------
    journal_topics : list[str] | None
        Names of all descriptors of the journal's MeSH headings. None if
        the article has no (or an empty) ``MedlineTA`` element or if the
        MeSH lookup for the journal returned None.
    """
    # Journal topic
    ta_node = xml_article.find("./MedlineCitation/MedlineJournalInfo/MedlineTA")
    abbreviation = None if ta_node is None else ta_node.text
    if abbreviation is None:
        return None

    meshes = request_mesh_from_nlm_ta(abbreviation)
    if meshes is None:
        return None

    topics = []
    for mesh in meshes:
        topics.extend(desc["name"] for desc in mesh["descriptor"])
    return topics