Source code for bluesearch.database.article

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Abstraction of scientific article data and related tools."""
from __future__ import annotations

import enum
import hashlib
import html
import logging
import re
import string
import unicodedata
from abc import ABC, abstractmethod
from dataclasses import dataclass
from io import StringIO
from pathlib import Path
from typing import IO, Generator, Iterable, Optional, Sequence, Tuple
from xml.etree.ElementTree import Element  # nosec
from zipfile import ZipFile

from defusedxml import ElementTree
from mashumaro.mixins.json import DataClassJSONMixin

logger = logging.getLogger(__name__)


class ArticleSource(enum.Enum):
    """Enumeration of the origins an article can come from.

    The value of each member is the lowercase name of the source.
    """

    # Preprint servers
    ARXIV = "arxiv"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"

    # Literature databases
    PMC = "pmc"
    PUBMED = "pubmed"

    # Fallback when the origin cannot be determined
    UNKNOWN = "unknown"
def get_arxiv_id(path: str | Path, with_prefix: bool = True) -> str:
    """Infer the versioned arXiv ID of an article from its file path.

    Parameters
    ----------
    path
        Location of the arXiv article file.
    with_prefix
        Whether the returned ID should start with the prefix "arxiv:".

    Returns
    -------
    str
        The arXiv ID, version included.

    Raises
    ------
    ValueError
        If the path matches neither the new nor the old arXiv naming scheme.

    References
    ----------
    https://arxiv.org/help/arxiv_identifier
    """
    path = Path(path)
    prefix = "arxiv:" if with_prefix else ""

    # New scheme (since 2007-04): the file stem alone is the identifier.
    #   - YYMM.NNNN  (4 digits) up to 2014-12
    #   - YYMM.NNNNN (5 digits) since 2015-01
    stem = path.stem
    if re.fullmatch(r"\d{4}\.\d{4}\d?v\d+", stem) is not None:
        return f"{prefix}{stem}"

    # Old scheme (up to 2007-03): the archive name is part of the ID and only
    # appears in the directory structure, so the path tail is inspected:
    #   some_path/arxiv/<archive>/<format>/YYMM/YYMMNNNv<version>.<ext>
    # Note: the archive name may contain "-".
    tail = "/".join(path.parts[-5:])
    old_style = re.search(r"arxiv/([\w-]+)/\w+/\d{4}/(\d{7}v\d+)\.\w+\Z", tail)
    if old_style is None:
        raise ValueError(f"Could not extract arXiv ID from file path {path}")

    archive, number = old_style.groups()
    return f"{prefix}{archive}/{number}"
class ArticleParser(ABC):
    """Common interface that every article parser has to implement.

    Subclasses must provide ``title``, ``authors``, ``abstract`` and
    ``paragraphs``. The identifier properties default to ``None`` and can
    be overridden where the underlying format carries them.
    """

    @property
    @abstractmethod
    def title(self) -> str:
        """Title of the article.

        Returns
        -------
        str
            The article title.
        """

    @property
    @abstractmethod
    def authors(self) -> Iterable[str]:
        """Names of all authors.

        Returns
        -------
        iterable of str
            All authors.
        """

    @property
    @abstractmethod
    def abstract(self) -> Iterable[str]:
        """Paragraphs making up the article abstract.

        Returns
        -------
        iterable of str
            The paragraphs of the article abstract.
        """

    @property
    @abstractmethod
    def paragraphs(self) -> Iterable[tuple[str, str]]:
        """Paragraphs of the article together with their section titles.

        Returns
        -------
        iterable of (str, str)
            One tuple per paragraph: first the title of the section the
            paragraph belongs to, then the paragraph content.
        """

    @property
    def pubmed_id(self) -> str | None:
        """PubMed ID of the article.

        Returns
        -------
        str or None
            Pubmed ID if specified, otherwise None.
        """
        return None

    @property
    def pmc_id(self) -> str | None:
        """PMC ID of the article.

        Returns
        -------
        str or None
            PMC ID if specified, otherwise None.
        """
        return None

    @property
    def arxiv_id(self) -> str | None:
        """ArXiv ID of the article.

        Returns
        -------
        str or None
            arXiv ID if specified, otherwise None.
        """
        return None

    @property
    def doi(self) -> str | None:
        """DOI of the article.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        return None

    @staticmethod
    def get_uid_from_identifiers(identifiers: tuple[str | None, ...]) -> str:
        """Derive a deterministic UID from the identifiers of a paper.

        Equal identifier tuples always give the same UID. A missing
        identifier is passed as `None` and counts as a value of its own,
        so `(a, None)` and `(a, b)` give two different UIDs.

        Parameters
        ----------
        identifiers
            Values of the identifiers.

        Returns
        -------
        str
            The hexadecimal MD5 digest of the string form of `identifiers`.

        Raises
        ------
        ValueError
            If all identifiers are `None`.
        """
        if not any(value is not None for value in identifiers):
            raise ValueError(
                f"Identifiers = {identifiers} are all `None`, UID cannot be computed."
            )

        # MD5 serves as a fingerprint here, not as a security measure.
        payload = str(identifiers).encode()
        return hashlib.md5(payload).hexdigest()  # nosec

    @property
    def uid(self) -> str:
        """Deterministic UID of the article.

        Normally the UID is a hash of the article identifiers. When no
        identifier at all is available, the title, authors, abstract and
        paragraphs are hashed instead.

        Returns
        -------
        str
            A deterministic UID.
        """
        identifiers = (self.pubmed_id, self.pmc_id, self.arxiv_id, self.doi)

        # At least one identifier is known: hash the identifiers.
        if any(value is not None for value in identifiers):
            return self.get_uid_from_identifiers(identifiers)

        # No identifier is known: fall back to hashing the whole content.
        logger.warning(
            f"No identifier available, generating UID by hashing whole "
            f'content for article "{self.title}"'
        )
        digest = hashlib.md5()  # nosec
        digest.update(self.title.encode())
        for content in (self.authors, self.abstract, self.paragraphs):
            digest.update(str(list(content)).encode())
        return digest.hexdigest()
class JATSXMLParser(ArticleParser):
    """Parser for JATS XML files.

    This could be used for articles from PubMed Central, bioRxiv, and medRxiv.

    Parameters
    ----------
    xml_stream
        The xml stream of the article.
    """

    def __init__(self, xml_stream: IO) -> None:
        super().__init__()
        self.content = ElementTree.parse(xml_stream)
        # Collected once; the ID properties below only look values up here.
        self.ids = self.get_ids()

    @classmethod
    def from_string(cls, xml_string: str) -> JATSXMLParser:
        """Read xml string and instantiate JATSXML Parser.

        Parameters
        ----------
        xml_string
            Raw content of the article

        Returns
        -------
        JATSXMLParser
            Parser containing the article content.
        """
        with StringIO(xml_string) as stream:
            obj = cls(stream)
        return obj

    @classmethod
    def from_xml(cls, path: str | Path) -> JATSXMLParser:
        """Read xml file and instantiate JATSXML Parser.

        Parameters
        ----------
        path
            Path to the article (with .xml extension)

        Returns
        -------
        JATSXMLParser
            Parser containing the article content.
        """
        # Binary mode: the XML parser then decodes the document according to
        # its own encoding declaration instead of the platform default text
        # encoding. This matches what ``from_zip`` does.
        with open(path, "rb") as fh:
            obj = cls(fh)
        return obj

    @classmethod
    def from_zip(cls, path: str | Path) -> JATSXMLParser:
        """Read xml file from a zipped .meca folder and instantiate JATSXML Parser.

        Parameters
        ----------
        path
            Path to the article (with .meca extension)

        Returns
        -------
        JATSXMLParser
            Parser containing the article content.

        Raises
        ------
        ValueError
            If the archive does not contain exactly one .xml file inside
            of content/.
        """
        with ZipFile(path) as myzip:
            xml_files = [
                x
                for x in myzip.namelist()
                if x.startswith("content/") and x.endswith(".xml")
            ]

            if len(xml_files) != 1:
                raise ValueError(
                    "There needs to be exactly one .xml file inside of content/"
                )

            xml_file = xml_files[0]

            # Parsing logic
            with myzip.open(xml_file, "r") as fh:
                obj = cls(fh)
        return obj

    @property
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title, or an empty string if there is no title element.
        """
        titles = self.content.find("./front/article-meta/title-group/article-title")
        return self._element_to_str(titles)

    @property
    def authors(self) -> Generator[str, None, None]:
        """Get all author names.

        Yields
        ------
        str
            Every author, in the format "Given_Name(s) Surname".
        """
        authors = self.content.findall(
            "./front/article-meta/contrib-group/contrib[@contrib-type='author']"
        )
        for author in authors:
            given_names = self._element_to_str(author.find("name/given-names"))
            surname = self._element_to_str(author.find("name/surname"))
            if given_names == "" or surname == "":
                # In rare cases, an author may not have a given name or a surname,
                # e.g. it could be an organization. We decide to skip those.
                continue
            author_str = given_names + " " + surname
            yield author_str.strip()

    @property
    def abstract(self) -> Generator[str, None, None]:
        """Get a sequence of paragraphs in the article abstract.

        Yields
        ------
        str
            The paragraphs of the article abstract.
        """
        abstract = self.content.find("./front/article-meta/abstract")
        # Compare with None explicitly: the truth value of an Element
        # reflects whether it has children, not whether it was found.
        if abstract is not None:
            for _, text in self.parse_section(abstract):
                yield text

    @property
    def paragraphs(self) -> Generator[tuple[str, str], None, None]:
        """Get all paragraphs and titles of sections they are part of.

        Paragraphs can be parts of text body, or figure or table captions.

        Yields
        ------
        section : str
            The section title.
        text : str
            The paragraph content.
        """
        # Paragraphs of text body
        body = self.content.find("./body")
        if body is not None:
            yield from self.parse_section(body)

        # Figure captions
        for fig in self.content.findall("./body//fig"):
            caption = " ".join(self._element_to_str(c) for c in fig.findall("caption"))
            if caption:
                yield "Figure Caption", caption

        # Table captions: use the caption paragraphs if there are any,
        # otherwise fall back to the caption titles.
        for table in self.content.findall("./body//table-wrap"):
            caption_elements = table.findall("./caption/p") or table.findall(
                "./caption/title"
            )
            caption = " ".join(self._element_to_str(c) for c in caption_elements)
            if caption:
                yield "Table Caption", caption

    @property
    def pubmed_id(self) -> str | None:
        """Get Pubmed ID.

        Returns
        -------
        str or None
            Pubmed ID if specified, otherwise None.
        """
        return self.ids.get("pmid")

    @property
    def pmc_id(self) -> str | None:
        """Get PMC ID.

        Returns
        -------
        str or None
            PMC ID if specified, otherwise None.
        """
        return self.ids.get("pmc")

    @property
    def doi(self) -> str | None:
        """Get DOI.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        return self.ids.get("doi")

    def get_ids(self) -> dict[str, str]:
        """Get all specified IDs of the paper.

        Returns
        -------
        ids : dict
            Dictionary whose keys are ids type and value are ids values.
            ``article-id`` elements without a ``pub-id-type`` attribute
            are ignored.
        """
        ids = {}
        for article_id in self.content.findall("./front/article-meta/article-id"):
            id_type = article_id.get("pub-id-type")
            if id_type is None:
                continue
            ids[id_type] = article_id.text
        return ids

    def parse_section(self, section: Element) -> Generator[tuple[str, str], None, None]:
        """Parse section children depending on the tag.

        Nested ``sec`` elements are parsed recursively with their own title.
        Titles, captions, figures and tables are skipped here.

        Parameters
        ----------
        section
            The input XML element.

        Yields
        ------
        str
            The section title.
        str
            A parsed string representation of a child of the input XML
            element. Children that result in an empty string are skipped.
        """
        sec_title = self._element_to_str(section.find("title"))
        for element in section:
            if element.tag == "sec":
                yield from self.parse_section(element)
            elif element.tag in {"title", "caption", "fig", "table-wrap"}:
                continue
            else:
                text = self._element_to_str(element)
                if text:
                    yield sec_title, text

    def _inner_text(self, element: Element) -> str:
        """Convert all inner text and sub-elements to one string.

        In short, we collect all the inner text while also converting all
        sub-elements that we encounter to strings using ``self._element_to_str``.
        All escaped HTML in the raw text is unescaped.

        For example, if schematically the element is given by

            element = "<p>I <bold>like</bold> python &amp; ice cream.</p>"

        then ``_inner_text(element)`` would give

            "I like python & ice cream."

        provided that "<bold>like</bold>" is resolved to "like" by the
        ``self._element_to_str`` method.

        Parameters
        ----------
        element
            The input XML element.

        Returns
        -------
        str
            The inner text and sub-elements converted to one single string,
            NFKC-normalized and stripped of surrounding whitespace.
        """
        text_parts = [html.unescape(element.text or "")]
        for sub_element in element:
            # recursively parse the sub-element
            text_parts.append(self._element_to_str(sub_element))
            # don't forget the text after the sub-element
            text_parts.append(html.unescape(sub_element.tail or ""))
        return unicodedata.normalize("NFKC", "".join(text_parts)).strip()

    def _element_to_str(self, element: Element | None) -> str:
        """Convert an element and all its contents to a string.

        Parameters
        ----------
        element
            The input XML element.

        Returns
        -------
        str
            A parsed string representation of the input XML element. An
            empty string if the element is None or has one of the
            discarded tags.
        """
        if element is None:
            return ""

        if element.tag in {
            "bold",
            "italic",
            "monospace",
            "p",
            "sc",
            "styled-content",
            "underline",
            "xref",
        }:
            # Mostly styling tags for which getting the inner text is enough.
            # Currently this is the same as the default handling. Writing it out
            # explicitly here to decouple from the default handling, which may
            # change in the future.
            return self._inner_text(element)
        elif element.tag == "sub":
            return f"_{self._inner_text(element)}"
        elif element.tag == "sup":
            return f"^{self._inner_text(element)}"
        elif element.tag in {
            "disp-formula",
            "email",
            "ext-link",
            "inline-formula",
            "uri",
        }:
            # The content of these elements is discarded.
            return ""
        else:
            # Default handling for all other element tags
            return self._inner_text(element)
class PubMedXMLParser(ArticleParser):
    """Parser for PubMed abstract.

    Parameters
    ----------
    data
        Either an already parsed XML element, or the path to an XML file
        that is parsed on construction.
    """

    def __init__(self, data: Element | Path | str) -> None:
        super().__init__()
        self.content: ElementTree
        if isinstance(data, Element):
            self.content = data
        else:
            self.content = ElementTree.parse(str(data))

    @staticmethod
    def _element_to_str(element: Element | None) -> str:
        """Convert an element and all its contents to a string.

        The text of nested elements is included, so inline markup inside
        the element does not truncate the result.

        Parameters
        ----------
        element
            The input XML element.

        Returns
        -------
        str
            The concatenated text of the element, or an empty string if
            the element is None.
        """
        if element is None:
            return ""
        return "".join(element.itertext())

    @property
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title, or an empty string if there is no title element.
        """
        title = self.content.find("./MedlineCitation/Article/ArticleTitle")
        return self._element_to_str(title)

    @property
    def authors(self) -> Iterable[str]:
        """Get all author names.

        Yields
        ------
        str
            Every valid author that has a forename and/or a last name,
            in the format "ForeName LastName".
        """
        authors = self.content.find("./MedlineCitation/Article/AuthorList")

        if authors is None:
            # No author to parse: stop here, nothing is yielded.
            return

        for author in authors:
            # Author entries with 'ValidYN' == 'N' are incorrect entries:
            # https://dtd.nlm.nih.gov/ncbi/pubmed/doc/out/190101/att-ValidYN.html.
            if author.get("ValidYN") == "Y":
                # 'LastName' is a required field if there is no 'CollectiveName'.
                lastname = author.find("LastName")
                # 'ForeName' is an optional field only used with 'LastName'.
                forenames = author.find("ForeName")

                parts = (forenames, lastname)
                # Empty elements have no text; skip them so that joining
                # the name parts cannot fail.
                name = [x.text for x in parts if x is not None and x.text]
                if len(name) > 0:
                    yield " ".join(name)

    @property
    def abstract(self) -> Iterable[str]:
        """Get a sequence of paragraphs in the article abstract.

        Yields
        ------
        str
            The non-empty paragraphs of the article abstract.
        """
        paragraphs = self.content.find("./MedlineCitation/Article/Abstract")

        if paragraphs is None:
            # No paragraphs to parse: stop here, nothing is yielded.
            return

        for paragraph in paragraphs.iter("AbstractText"):
            # Use the full inner text: `.text` alone stops at the first
            # nested element and is None for elements without leading text.
            text = self._element_to_str(paragraph)
            if text:
                yield text

    @property
    def paragraphs(self) -> Iterable[tuple[str, str]]:
        """Get all paragraphs and titles of sections they are part of.

        Returns
        -------
        iterable of (str, str)
            For each paragraph a tuple with two strings is returned. The first
            is the section title, the second the paragraph content.
        """
        # No paragraph to parse in PubMed article sets: return an empty iterable.
        return ()

    @property
    def pubmed_id(self) -> str | None:
        """Get Pubmed ID.

        Returns
        -------
        str or None
            Pubmed ID if specified, otherwise None.
        """
        pubmed_id = self.content.find("./MedlineCitation/PMID")
        return None if pubmed_id is None else pubmed_id.text

    @property
    def pmc_id(self) -> str | None:
        """Get PMC ID.

        Returns
        -------
        str or None
            PMC ID if specified, otherwise None.
        """
        pmc_id = self.content.find(
            "./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']"
        )
        return None if pmc_id is None else pmc_id.text

    @property
    def doi(self) -> str | None:
        """Get DOI.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        doi = self.content.find("./PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
        return None if doi is None else doi.text
class CORD19ArticleParser(ArticleParser):
    """Article parser for the JSON files of the CORD-19 database.

    Parameters
    ----------
    json_file
        The already loaded content of a CORD-19 JSON file.

    Raises
    ------
    ValueError
        If one of the expected top-level keys is missing.
    """

    def __init__(self, json_file: dict) -> None:
        # Only a reference is stored, so the content must not be modified.
        self.data = json_file

        # Top-level keys that have to be present. The spec also lists
        # "abstract", but it's missing from the PMC parses.
        required_keys = {
            "paper_id",
            "metadata",
            "body_text",
            "bib_entries",
            "ref_entries",
            "back_matter",
        }
        if required_keys.issubset(json_file.keys()):
            return

        missing_keys = required_keys - set(json_file.keys())
        raise ValueError(f"Incomplete JSON file. Missing keys: {missing_keys}")

    @property
    def title(self) -> str:
        """Title of the article.

        Returns
        -------
        str
            The article title.
        """
        metadata = self.data["metadata"]
        return metadata["title"]

    @property
    def authors(self) -> Generator[str, None, None]:
        """Names of all authors.

        Yields
        ------
        str
            Every author: first, middle and last names and the suffix,
            separated by spaces, with empty parts left out.
        """
        for entry in self.data["metadata"]["authors"]:
            name_parts = (
                entry["first"],
                " ".join(entry["middle"]),
                entry["last"],
                entry["suffix"],
            )
            yield " ".join(part for part in name_parts if part != "")

    @property
    def abstract(self) -> list[str]:
        """Paragraphs making up the article abstract.

        Returns
        -------
        list of str
            The paragraphs of the article abstract. Empty if the JSON
            file has no "abstract" key.
        """
        entries = self.data["abstract"] if "abstract" in self.data else []
        return [entry["text"] for entry in entries]

    @property
    def paragraphs(self) -> Generator[tuple[str, str], None, None]:
        """Paragraphs of the article together with their section titles.

        Yields
        ------
        str
            The section title.
        str
            The paragraph content.
        """
        # Body text first ...
        yield from (
            (entry["section"], entry["text"]) for entry in self.data["body_text"]
        )
        # ... followed by the figure/table captions, which have always been
        # included under the generic "Caption" section title.
        yield from (
            ("Caption", entry["text"]) for entry in self.data["ref_entries"].values()
        )

    @property
    def pmc_id(self) -> str | None:
        """PMC ID of the article.

        Returns
        -------
        str or None
            The value stored under "paper_id", or None if that key is absent.
        """
        return self.data.get("paper_id")

    def __str__(self):
        """Get the string representation of the parser instance."""
        paper_id = self.data["paper_id"]
        return f"CORD-19 article ID={paper_id}"
class TEIXMLParser(ArticleParser):
    """Parser for TEI XML files.

    Parameters
    ----------
    path
        The path to a TEI XML file.
    is_arxiv
        Set to `True` if the TEI XML file was generated by parsing an arXiv PDF.
        The arXiv ID is then inferred from the file path.
    """

    def __init__(self, path: str | Path, is_arxiv: bool | None = False) -> None:
        path = Path(path)
        # Hand the raw bytes to the XML parser so that the document is decoded
        # according to its own encoding declaration rather than with the
        # platform default text encoding.
        self.content = ElementTree.fromstring(path.read_bytes())
        self.tei_namespace = {"tei": "http://www.tei-c.org/ns/1.0"}
        # Filled lazily on first access of ``tei_ids``.
        self._tei_ids: dict[str, str] | None = None
        self._arxiv_id = get_arxiv_id(path) if is_arxiv else None

    @property
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title, or an empty string if none was found.
        """
        # NOTE(review): the trailing "/" presumably makes this select the
        # first child element of titleStmt — confirm against ElementPath.
        title = self.content.find(
            "./tei:teiHeader/tei:fileDesc/tei:titleStmt/", self.tei_namespace
        )
        return self._element_to_str(title)

    @property
    def authors(self) -> Generator[str, None, None]:
        """Get all author names.

        Yields
        ------
        str
            Every author, in the format "Given_Name(s) Surname".
        """
        for pers_name in self.content.findall(
            "./tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:analytic"
            "/tei:author/tei:persName",
            self.tei_namespace,
        ):
            parts = [
                pers_name.find("./tei:forename[@type='first']", self.tei_namespace),
                pers_name.find("./tei:forename[@type='middle']", self.tei_namespace),
                pers_name.find("./tei:surname", self.tei_namespace),
            ]

            # Missing name parts become empty strings and are left out.
            parts = [self._element_to_str(part) for part in parts]
            yield " ".join([part for part in parts if part]).strip()

    @property
    def abstract(self) -> Generator[str, None, None]:
        """Get a sequence of paragraphs in the article abstract.

        Yields
        ------
        str
            The paragraphs of the article abstract.
        """
        for div in self.content.findall(
            "./tei:teiHeader/tei:profileDesc/tei:abstract/tei:div",
            self.tei_namespace,
        ):
            yield from self._build_texts(div)

    @property
    def paragraphs(self) -> Generator[tuple[str, str], None, None]:
        """Get all paragraphs and titles of sections they are part of.

        Paragraphs can be parts of text body, or figure or table captions.

        Yields
        ------
        section_title : str
            The section title.
        text : str
            The paragraph content.
        """
        for div in self.content.findall(
            "./tei:text/tei:body/tei:div",
            self.tei_namespace,
        ):
            head = div.find("./tei:head", self.tei_namespace)
            section_title = self._element_to_str(head)
            # Everything in the div except its head is text content.
            text_elements = []
            for child in div:
                if not child.tag.endswith("head"):
                    text_elements.append(child)
            for text in self._build_texts(text_elements):
                yield section_title, text

        # Figure and Table Caption
        for figure in self.content.findall(
            "./tei:text/tei:body/tei:figure", self.tei_namespace
        ):
            caption = figure.find("./tei:figDesc", self.tei_namespace)
            caption_str = self._element_to_str(caption)
            if not caption_str:
                continue
            if figure.get("type") == "table":
                yield "Table Caption", caption_str
            else:
                yield "Figure Caption", caption_str

    @property
    def arxiv_id(self) -> str | None:
        """Get arXiv ID.

        Returns
        -------
        str or None
            arXiv ID if specified, otherwise None.
        """
        return self._arxiv_id

    @property
    def doi(self) -> str | None:
        """Get DOI.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        return self.tei_ids.get("DOI")

    @property
    def tei_ids(self) -> dict:
        """Extract all IDs of the TEI XML.

        The IDs are extracted on first access and cached afterwards.

        Returns
        -------
        dict
            Dictionary containing all the IDs of the TEI XML content
            with the key being the ID type and the value being the ID value.
        """
        if self._tei_ids is None:
            self._tei_ids = {}
            for idno in self.content.findall(
                "./tei:teiHeader/tei:fileDesc/tei:sourceDesc"
                "/tei:biblStruct/tei:idno",
                self.tei_namespace,
            ):
                id_type = idno.get("type")
                self._tei_ids[id_type] = idno.text

        return self._tei_ids

    @staticmethod
    def _element_to_str(element: Element | None) -> str:
        """Convert an element and all its contents to a string.

        Parameters
        ----------
        element
            The input XML element.

        Returns
        -------
        str
            A parsed string representation of the input XML element,
            or an empty string if the element is None.
        """
        if element is None:
            return ""
        return "".join(element.itertext())

    def _build_texts(self, elements: Iterable[Element]) -> Generator[str, None, None]:
        """Compose paragraphs and formulas to meaningful texts.

        In the abstract and main text of TEI XML parsers one finds a mix of
        <p> and <formula> tags. Several of these tags could be part of one
        sentence. This method tries to reconstruct sentences that are
        partitioned in this way. The formulas are replaced by the FORMULA
        placeholder.

        Parameters
        ----------
        elements
            An iterable of <p> and <formula> elements.

        Yields
        ------
        str
            One or more sentences as one string.

        Raises
        ------
        RuntimeError
            If a tag is encountered that is neither <p> nor <formula>.
        """
        # In TEI XML all tags are prefixed with the namespace.
        ns = self.tei_namespace["tei"]
        prefix = f"{{{ns}}}" if ns else ""
        # At every change ensure that there's no space at the end of text
        text = ""

        def if_non_empty(text_: str) -> Generator[str, None, None]:
            """Yield if text is non-empty and make sure it ends with a period."""
            if text_:
                if not text_.endswith("."):
                    text_ += "."
                yield text_

        for child in elements:
            if child.tag == prefix + "p":
                p_text = self._element_to_str(child).strip()
                if not p_text:
                    continue
                # A paragraph starting with an ASCII uppercase letter is
                # taken as the start of a new sentence.
                if p_text[0] in string.ascii_uppercase:
                    # The sentence in the text has finished.
                    # Yield and start a new one
                    yield from if_non_empty(text)
                    text = p_text
                else:
                    # The sentence in the text continues
                    text += " " + p_text
            elif child.tag == prefix + "formula":
                # Maybe use FORMULA-BLOCK instead?
                text += " FORMULA"
            else:
                all_text = "".join(self._element_to_str(e) for e in elements)
                raise RuntimeError(
                    f"Unexpected tag: {child.tag}\nall text:\n{all_text}"
                )

        # Yield the last remaining text
        yield from if_non_empty(text)
@dataclass(frozen=True)
class Article(DataClassJSONMixin):
    """Immutable container for the contents of a scientific article."""

    title: str
    authors: Sequence[str]
    abstract: Sequence[str]
    section_paragraphs: Sequence[Tuple[str, str]]
    pubmed_id: Optional[str] = None
    pmc_id: Optional[str] = None
    arxiv_id: Optional[str] = None
    doi: Optional[str] = None
    uid: Optional[str] = None

    @classmethod
    def parse(cls, parser: ArticleParser) -> Article:
        """Build an article from everything a parser provides.

        Parameters
        ----------
        parser
            An article parser instance.

        Returns
        -------
        Article
            The article with the authors, abstract and paragraphs of
            the parser materialized as tuples.
        """
        return cls(
            title=parser.title,
            authors=tuple(parser.authors),
            abstract=tuple(parser.abstract),
            section_paragraphs=tuple(parser.paragraphs),
            pubmed_id=parser.pubmed_id,
            pmc_id=parser.pmc_id,
            arxiv_id=parser.arxiv_id,
            doi=parser.doi,
            uid=parser.uid,
        )

    def iter_paragraphs(
        self, with_abstract: bool = False
    ) -> Generator[tuple[str, str], None, None]:
        """Walk through the paragraphs of the article.

        Parameters
        ----------
        with_abstract : bool
            If true the abstract paragraphs come first, under the
            section title "Abstract".

        Yields
        ------
        str
            Section title of the section the paragraph is in.
        str
            The paragraph text.
        """
        if with_abstract:
            yield from (("Abstract", paragraph) for paragraph in self.abstract)

        yield from self.section_paragraphs

    def __str__(self) -> str:
        """Summarize the article in a few lines of statistics.

        Returns
        -------
        str
            A summary of the article statistics.
        """
        # Number of characters per section, in order of first appearance
        chars_per_section = {}
        for section_title, text in self.section_paragraphs:
            chars_per_section[section_title] = chars_per_section.get(
                section_title, 0
            ) + len(text)

        n_abstract_chars = sum(len(paragraph) for paragraph in self.abstract)
        n_main_chars = sum(chars_per_section.values())

        # One entry per output line
        lines = [
            f'Title : "{self.title}"',
            f'Authors : {", ".join(self.authors)}',
            f"Abstract : {len(self.abstract)} paragraph(s), "
            f"{n_abstract_chars} characters",
            f"Sections : {len(chars_per_section)} section(s) "
            f"{n_main_chars} characters",
        ]
        lines.extend(f"- {section}" for section in chars_per_section)
        lines.append(f"Total text length : {n_abstract_chars + n_main_chars}")

        return "\n".join(lines)