# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Abstraction of scientific article data and related tools."""
from __future__ import annotations
import enum
import hashlib
import html
import logging
import re
import string
import unicodedata
from abc import ABC, abstractmethod
from dataclasses import dataclass
from io import StringIO
from pathlib import Path
from typing import IO, Generator, Iterable, Optional, Sequence, Tuple
from xml.etree.ElementTree import Element # nosec
from zipfile import ZipFile
from defusedxml import ElementTree
from mashumaro.mixins.json import DataClassJSONMixin
logger = logging.getLogger(__name__)
class ArticleSource(enum.Enum):
    """Enumeration of the places an article can originate from.

    The value of every member is the lowercase string form of its name.
    """

    ARXIV = "arxiv"
    BIORXIV = "biorxiv"
    MEDRXIV = "medrxiv"
    PMC = "pmc"
    PUBMED = "pubmed"
    # Used when none of the sources above applies.
    UNKNOWN = "unknown"
def get_arxiv_id(path: str | Path, with_prefix: bool = True) -> str:
    """Derive the versioned arXiv ID of an article from its file path.

    Two identifier schemes are recognised:

    * New scheme (since 2007-04): the file stem alone is the identifier,
      ``YYMM.NNNNvV`` (up to 2014-12) or ``YYMM.NNNNNvV`` (since 2015-01).
    * Old scheme (up to 2007-03): the identifier is rebuilt from the last
      five path components, which must look like
      ``arxiv/<archive>/<format>/YYMM/YYMMNNNv<version>.<ext>``.

    Parameters
    ----------
    path
        The file path to an arXiv article.
    with_prefix
        If `True`, the returned arXiv ID starts with the prefix "arxiv:".

    Returns
    -------
    str
        The computed arXiv ID.

    Raises
    ------
    ValueError
        If the path matches neither identifier scheme.

    References
    ----------
    https://arxiv.org/help/arxiv_identifier
    """
    file_path = Path(path)
    lead = "arxiv:" if with_prefix else ""

    # New scheme: the whole stem has to be the identifier.
    stem = file_path.stem
    if re.fullmatch(r"\d{4}\.\d{4}\d?v\d+", stem) is not None:
        return f"{lead}{stem}"

    # Old scheme: inspect the trailing path components.
    # Note: the archive name may contain "-".
    tail = "/".join(file_path.parts[-5:])
    old_style = re.search(r"arxiv/([\w-]+)/\w+/\d{4}/(\d{7}v\d+)\.\w+\Z", tail)
    if old_style is None:
        raise ValueError(f"Could not extract arXiv ID from file path {file_path}")

    archive, number = old_style.groups()
    return f"{lead}{archive}/{number}"
class ArticleParser(ABC):
    """Common interface every article parser has to implement.

    Subclasses must provide ``title``, ``authors``, ``abstract`` and
    ``paragraphs``. The identifier properties default to `None` and can be
    overridden where the underlying format carries the information.
    """

    @property
    @abstractmethod
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title.
        """

    @property
    @abstractmethod
    def authors(self) -> Iterable[str]:
        """Get all author names.

        Returns
        -------
        iterable of str
            All authors.
        """

    @property
    @abstractmethod
    def abstract(self) -> Iterable[str]:
        """Get a sequence of paragraphs in the article abstract.

        Returns
        -------
        iterable of str
            The paragraphs of the article abstract.
        """

    @property
    @abstractmethod
    def paragraphs(self) -> Iterable[tuple[str, str]]:
        """Get all paragraphs and titles of sections they are part of.

        Returns
        -------
        iterable of (str, str)
            One tuple per paragraph: first the section title, then the
            paragraph content.
        """

    @property
    def pubmed_id(self) -> str | None:
        """Get Pubmed ID.

        Returns
        -------
        str or None
            Pubmed ID if specified, otherwise None.
        """
        return None

    @property
    def pmc_id(self) -> str | None:
        """Get PMC ID.

        Returns
        -------
        str or None
            PMC ID if specified, otherwise None.
        """
        return None

    @property
    def arxiv_id(self) -> str | None:
        """Get arXiv ID.

        Returns
        -------
        str or None
            arXiv ID if specified, otherwise None.
        """
        return None

    @property
    def doi(self) -> str | None:
        """Get DOI.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        return None

    @staticmethod
    def get_uid_from_identifiers(identifiers: tuple[str | None, ...]) -> str:
        """Compute a deterministic UID from a tuple of paper identifiers.

        The UID is the MD5 hex digest of ``str(identifiers)``, so equal
        tuples always map to the same UID. A missing identifier is passed
        as `None` and counts as a value of its own: ``(a, None)`` and
        ``(a, b)`` produce different UIDs.

        Parameters
        ----------
        identifiers
            Values of the identifiers.

        Returns
        -------
        str
            A deterministic UID computed from the identifiers.

        Raises
        ------
        ValueError
            If all identifiers are `None`.
        """
        if not any(value is not None for value in identifiers):
            raise ValueError(
                f"Identifiers = {identifiers} are all `None`, UID cannot be computed."
            )
        payload = str(identifiers).encode()
        return hashlib.md5(payload).hexdigest()  # nosec

    @property
    def uid(self) -> str:
        """Generate deterministic UID for an article.

        When at least one of the Pubmed ID, PMC ID, arXiv ID or DOI is set,
        the UID is derived from these four identifiers. Otherwise a warning
        is logged and the UID is the MD5 digest of the title, authors,
        abstract and paragraphs.

        Returns
        -------
        str
            A deterministic UID.
        """
        identifiers = (self.pubmed_id, self.pmc_id, self.arxiv_id, self.doi)

        # Preferred path: hash the identifiers.
        if any(value is not None for value in identifiers):
            return self.get_uid_from_identifiers(identifiers)

        # Fallback path: hash the whole article content.
        logger.warning(
            f"No identifier available, generating UID by hashing whole "
            f'content for article "{self.title}"'
        )
        pieces = (
            self.title,
            str(list(self.authors)),
            str(list(self.abstract)),
            str(list(self.paragraphs)),
        )
        digest = hashlib.md5()  # nosec
        for piece in pieces:
            digest.update(piece.encode())
        return digest.hexdigest()
class JATSXMLParser(ArticleParser):
    """Parser for JATS XML files.

    This could be used for articles from PubMed Central, bioRxiv, and medRxiv.

    Parameters
    ----------
    xml_stream
        The xml stream of the article.
    """

    def __init__(self, xml_stream: IO) -> None:
        super().__init__()
        self.content = ElementTree.parse(xml_stream)
        # Article IDs are collected once at construction time.
        self.ids = self.get_ids()

    @classmethod
    def from_string(cls, xml_string: str) -> JATSXMLParser:
        """Read xml string and instantiate JATSXML Parser.

        Parameters
        ----------
        xml_string
            Raw content of the article

        Returns
        -------
        JATSXMLParser
            Parser containing the article content.
        """
        with StringIO(xml_string) as stream:
            obj = cls(stream)
        return obj

    @classmethod
    def from_xml(cls, path: str | Path) -> JATSXMLParser:
        """Read xml file and instantiate JATSXML Parser.

        Parameters
        ----------
        path
            Path to the article (with .xml extension)

        Returns
        -------
        JATSXMLParser
            Parser containing the article content.
        """
        # Open in binary mode so that the XML parser determines the encoding
        # itself instead of relying on the platform's default text encoding.
        with open(path, "rb") as fh:
            obj = cls(fh)
        return obj

    @classmethod
    def from_zip(cls, path: str | Path) -> JATSXMLParser:
        """Read xml file from a zipped .meca folder and instantiate JATSXML Parser.

        Parameters
        ----------
        path
            Path to the article (with .meca extension)

        Returns
        -------
        JATSXMLParser
            Parser containing the article content.

        Raises
        ------
        ValueError
            If the archive does not contain exactly one ``content/*.xml`` file.
        """
        with ZipFile(path) as myzip:
            xml_files = [
                x
                for x in myzip.namelist()
                if x.startswith("content/") and x.endswith(".xml")
            ]
            if len(xml_files) != 1:
                raise ValueError(
                    "There needs to be exactly one .xml file inside of content/"
                )
            xml_file = xml_files[0]
            # Parsing logic
            with myzip.open(xml_file, "r") as fh:
                obj = cls(fh)
        return obj

    @property
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title, or an empty string if there is no title element.
        """
        title = self.content.find("./front/article-meta/title-group/article-title")
        return self._element_to_str(title)

    @property
    def authors(self) -> Generator[str, None, None]:
        """Get all author names.

        Yields
        ------
        str
            Every author, in the format "Given_Name(s) Surname".
        """
        authors = self.content.findall(
            "./front/article-meta/contrib-group/contrib[@contrib-type='author']"
        )
        for author in authors:
            given_names = self._element_to_str(author.find("name/given-names"))
            surname = self._element_to_str(author.find("name/surname"))
            if given_names == "" or surname == "":
                # In rare cases, an author may not have a given name or a surname,
                # e.g. it could be an organization. We decide to skip those.
                continue
            author_str = given_names + " " + surname
            yield author_str.strip()

    @property
    def abstract(self) -> Generator[str, None, None]:
        """Get a sequence of paragraphs in the article abstract.

        Yields
        ------
        str
            The paragraphs of the article abstract.
        """
        abstract = self.content.find("./front/article-meta/abstract")
        # Compare against None explicitly: the truth value of an Element
        # reflects whether it has children, and testing it is deprecated.
        if abstract is not None:
            for _, text in self.parse_section(abstract):
                yield text

    @property
    def paragraphs(self) -> Generator[tuple[str, str], None, None]:
        """Get all paragraphs and titles of sections they are part of.

        Paragraphs can be parts of text body, or figure or table captions.
        Body paragraphs come first, then figure captions, then table captions.

        Yields
        ------
        section : str
            The section title.
        text : str
            The paragraph content.
        """
        # Paragraphs of text body
        body = self.content.find("./body")
        if body is not None:
            yield from self.parse_section(body)

        # Figure captions (findall always returns a list, possibly empty)
        for fig in self.content.findall("./body//fig"):
            caption = " ".join(
                self._element_to_str(c) for c in fig.findall("caption")
            )
            if caption:
                yield "Figure Caption", caption

        # Table captions: prefer the caption paragraphs, fall back to the title
        for table in self.content.findall("./body//table-wrap"):
            caption_elements = table.findall("./caption/p") or table.findall(
                "./caption/title"
            )
            caption = " ".join(self._element_to_str(c) for c in caption_elements)
            if caption:
                yield "Table Caption", caption

    @property
    def pubmed_id(self) -> str | None:
        """Get Pubmed ID.

        Returns
        -------
        str or None
            Pubmed ID if specified, otherwise None.
        """
        return self.ids.get("pmid")

    @property
    def pmc_id(self) -> str | None:
        """Get PMC ID.

        Returns
        -------
        str or None
            PMC ID if specified, otherwise None.
        """
        return self.ids.get("pmc")

    @property
    def doi(self) -> str | None:
        """Get DOI.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        return self.ids.get("doi")

    def get_ids(self) -> dict[str, str]:
        """Get all specified IDs of the paper.

        Only ``article-id`` elements carrying a ``pub-id-type`` attribute are
        considered. If a type occurs more than once, the last one wins.

        Returns
        -------
        ids : dict
            Dictionary whose keys are ids type and value are ids values.
        """
        ids = {}
        for article_id in self.content.findall("./front/article-meta/article-id"):
            id_type = article_id.get("pub-id-type")
            if id_type is None:
                continue
            ids[id_type] = article_id.text
        return ids

    def parse_section(self, section: Element) -> Generator[tuple[str, str], None, None]:
        """Parse section children depending on the tag.

        Nested ``sec`` elements are parsed recursively with their own title.
        ``title``, ``caption``, ``fig`` and ``table-wrap`` children are skipped,
        and children that convert to an empty string are dropped.

        Parameters
        ----------
        section
            The input XML element.

        Yields
        ------
        str
            The section title.
        str
            A parsed string representation of a child of the input XML element.
        """
        sec_title = self._element_to_str(section.find("title"))
        for element in section:
            if element.tag == "sec":
                yield from self.parse_section(element)
            elif element.tag in {"title", "caption", "fig", "table-wrap"}:
                continue
            else:
                text = self._element_to_str(element)
                if text:
                    yield sec_title, text

    def _inner_text(self, element: Element) -> str:
        """Convert all inner text and sub-elements to one string.

        In short, we collect all the inner text while also converting all
        sub-elements that we encounter to strings using ``self._element_to_str``.
        All escaped HTML in the raw text is unescaped, the result is
        NFKC-normalized and surrounding whitespace is stripped.

        For example, if schematically the element is given by

            element = "<p>I <bold>like</bold> python &amp; ice cream.</p>"

        then ``_inner_text(element)`` would give

            "I like python & ice cream."

        provided that "<bold>like</bold>" is resolved to "like" by the
        ``self._element_to_str`` method.

        Parameters
        ----------
        element
            The input XML element.

        Returns
        -------
        str
            The inner text and sub-elements converted to one single string.
        """
        text_parts = [html.unescape(element.text or "")]
        for sub_element in element:
            # recursively parse the sub-element
            text_parts.append(self._element_to_str(sub_element))
            # don't forget the text after the sub-element
            text_parts.append(html.unescape(sub_element.tail or ""))
        return unicodedata.normalize("NFKC", "".join(text_parts)).strip()

    def _element_to_str(self, element: Element | None) -> str:
        """Convert an element and all its contents to a string.

        Parameters
        ----------
        element
            The input XML element. `None` is converted to an empty string.

        Returns
        -------
        str
            A parsed string representation of the input XML element.
        """
        if element is None:
            return ""

        if element.tag in {
            "bold",
            "italic",
            "monospace",
            "p",
            "sc",
            "styled-content",
            "underline",
            "xref",
        }:
            # Mostly styling tags for which getting the inner text is enough.
            # Currently this is the same as the default handling. Writing it out
            # explicitly here to decouple from the default handling, which may
            # change in the future.
            return self._inner_text(element)
        elif element.tag == "sub":
            # Subscripts are marked with a leading underscore.
            return f"_{self._inner_text(element)}"
        elif element.tag == "sup":
            # Superscripts are marked with a leading caret.
            return f"^{self._inner_text(element)}"
        elif element.tag in {
            "disp-formula",
            "email",
            "ext-link",
            "inline-formula",
            "uri",
        }:
            # The content of these elements is dropped from the text.
            return ""
        else:
            # Default handling for all other element tags
            return self._inner_text(element)
class PubMedXMLParser(ArticleParser):
    """Parser for PubMed abstract.

    Parameters
    ----------
    data
        Either an already parsed XML element, or the path to an XML file
        to parse.
    """

    def __init__(self, data: Element | Path | str) -> None:
        super().__init__()
        # `content` is either the element that was passed in, or the tree
        # obtained by parsing the given file; both offer `find`.
        if isinstance(data, Element):
            self.content = data
        else:
            self.content = ElementTree.parse(str(data))

    @property
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title including the text of nested inline elements,
            or an empty string if there is no title element.
        """
        title = self.content.find("./MedlineCitation/Article/ArticleTitle")
        if title is None:
            return ""
        # `itertext` also collects the text inside and after nested elements,
        # which `.text` alone would silently drop.
        return "".join(title.itertext())

    @property
    def authors(self) -> Iterable[str]:
        """Get all author names.

        Yields
        ------
        str
            Every valid author as "ForeName LastName"; missing or empty
            name parts are left out.
        """
        authors = self.content.find("./MedlineCitation/Article/AuthorList")
        if authors is None:
            # No author to parse: stop here, the generator stays empty.
            return

        for author in authors:
            # Author entries with 'ValidYN' == 'N' are incorrect entries:
            # https://dtd.nlm.nih.gov/ncbi/pubmed/doc/out/190101/att-ValidYN.html.
            # An entry without the attribute is treated as valid, following
            # the DTD default of "Y" (NOTE(review): confirm against the DTD).
            if author.get("ValidYN", "Y") != "Y":
                continue

            # 'ForeName' is an optional field only used with 'LastName'.
            # 'LastName' is a required field if there is no 'CollectiveName'.
            parts = (author.find("ForeName"), author.find("LastName"))
            # Skip absent elements and elements without text, so that the
            # join below never sees `None`.
            name = [x.text for x in parts if x is not None and x.text]
            if name:
                yield " ".join(name)

    @property
    def abstract(self) -> Iterable[str]:
        """Get a sequence of paragraphs in the article abstract.

        Yields
        ------
        str
            The paragraphs of the article abstract, including the text of
            nested inline elements.
        """
        paragraphs = self.content.find("./MedlineCitation/Article/Abstract")
        if paragraphs is None:
            # No paragraphs to parse: stop here, the generator stays empty.
            return

        for paragraph in paragraphs.iter("AbstractText"):
            # Always yields a string, even for an empty element.
            yield "".join(paragraph.itertext())

    @property
    def paragraphs(self) -> Iterable[tuple[str, str]]:
        """Get all paragraphs and titles of sections they are part of.

        Returns
        -------
        iterable of (str, str)
            For each paragraph a tuple with two strings is returned. The first
            is the section title, the second the paragraph content.
        """
        # No paragraph to parse in PubMed article sets: return an empty iterable.
        return ()

    @property
    def pubmed_id(self) -> str | None:
        """Get Pubmed ID.

        Returns
        -------
        str or None
            Pubmed ID if specified, otherwise None.
        """
        pubmed_id = self.content.find("./MedlineCitation/PMID")
        return None if pubmed_id is None else pubmed_id.text

    @property
    def pmc_id(self) -> str | None:
        """Get PMC ID.

        Returns
        -------
        str or None
            PMC ID if specified, otherwise None.
        """
        pmc_id = self.content.find(
            "./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']"
        )
        return None if pmc_id is None else pmc_id.text

    @property
    def doi(self) -> str | None:
        """Get DOI.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        doi = self.content.find("./PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
        return None if doi is None else doi.text
class CORD19ArticleParser(ArticleParser):
    """Parser for CORD-19 JSON files.

    Parameters
    ----------
    json_file
        The contents of a JSON-file from the CORD-19 database.

    Raises
    ------
    ValueError
        If one of the required top-level keys is missing.
    """

    def __init__(self, json_file: dict) -> None:
        # The spec also lists "abstract", but the PMC parses do not have it,
        # so it is not part of the required keys.
        required = {
            "paper_id",
            "metadata",
            "body_text",
            "bib_entries",
            "ref_entries",
            "back_matter",
        }
        missing = required - set(json_file.keys())
        if missing:
            raise ValueError("Incomplete JSON file. Missing keys: " f"{missing}")

        # Only a reference is stored, hence the contents must not be modified.
        self.data = json_file

    @property
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title.
        """
        metadata = self.data["metadata"]
        return metadata["title"]

    @property
    def authors(self) -> Generator[str, None, None]:
        """Get all author names.

        Yields
        ------
        str
            Every author: first, middle, last name and suffix joined by
            single spaces, with empty parts left out.
        """
        for entry in self.data["metadata"]["authors"]:
            name_parts = (
                entry["first"],
                " ".join(entry["middle"]),
                entry["last"],
                entry["suffix"],
            )
            yield " ".join(part for part in name_parts if part != "")

    @property
    def abstract(self) -> list[str]:
        """Get a sequence of paragraphs in the article abstract.

        Returns
        -------
        list of str
            The paragraphs of the article abstract; empty if the file has
            no "abstract" key.
        """
        if "abstract" in self.data:
            return [item["text"] for item in self.data["abstract"]]
        return []

    @property
    def paragraphs(self) -> Generator[tuple[str, str], None, None]:
        """Get all paragraphs and titles of sections they are part of.

        Body paragraphs come first, followed by the texts of all reference
        entries under the section title "Caption".

        Yields
        ------
        str
            The section title.
        str
            The paragraph content.
        """
        yield from (
            (item["section"], item["text"]) for item in self.data["body_text"]
        )
        # We've always included figure/table captions like this
        yield from (
            ("Caption", entry["text"]) for entry in self.data["ref_entries"].values()
        )

    @property
    def pmc_id(self) -> str | None:
        """Get PMC ID.

        Returns
        -------
        str or None
            The value stored under "paper_id" if present, otherwise None.
        """
        return self.data.get("paper_id")

    def __str__(self):
        """Get the string representation of the parser instance."""
        paper_id = self.data["paper_id"]
        return f"CORD-19 article ID={paper_id}"
class TEIXMLParser(ArticleParser):
    """Parser for TEI XML files.

    Parameters
    ----------
    path
        The path to a TEI XML file.
    is_arxiv
        Set to `True` if the TEI XML file was generated by parsing an arXiv PDF.
        In that case the arXiv ID is computed from the file path.
    """

    def __init__(self, path: str | Path, is_arxiv: bool | None = False) -> None:
        path = Path(path)
        # Parse the raw bytes so that the XML parser determines the encoding
        # itself instead of relying on the platform's default text encoding.
        self.content = ElementTree.fromstring(path.read_bytes())
        self.tei_namespace = {"tei": "http://www.tei-c.org/ns/1.0"}
        # Cache for the `tei_ids` property, filled on first access.
        self._tei_ids: dict[str, str] | None = None
        self._arxiv_id = get_arxiv_id(path) if is_arxiv else None

    @property
    def title(self) -> str:
        """Get the article title.

        Returns
        -------
        str
            The article title, or an empty string if nothing was found.
        """
        # NOTE(review): the trailing "/" appears to make ElementTree select
        # the first child of <titleStmt> -- confirm this is intended.
        title = self.content.find(
            "./tei:teiHeader/tei:fileDesc/tei:titleStmt/", self.tei_namespace
        )
        return self._element_to_str(title)

    @property
    def authors(self) -> Generator[str, None, None]:
        """Get all author names.

        Yields
        ------
        str
            Every author, in the format "Given_Name(s) Surname".
        """
        for pers_name in self.content.findall(
            "./tei:teiHeader/tei:fileDesc/tei:sourceDesc/tei:biblStruct/tei:analytic"
            "/tei:author/tei:persName",
            self.tei_namespace,
        ):
            parts = [
                pers_name.find("./tei:forename[@type='first']", self.tei_namespace),
                pers_name.find("./tei:forename[@type='middle']", self.tei_namespace),
                pers_name.find("./tei:surname", self.tei_namespace),
            ]
            parts = [self._element_to_str(part) for part in parts]
            # Missing name parts are empty strings and are left out.
            yield " ".join([part for part in parts if part]).strip()

    @property
    def abstract(self) -> Generator[str, None, None]:
        """Get a sequence of paragraphs in the article abstract.

        Yields
        ------
        str
            The paragraphs of the article abstract.
        """
        for div in self.content.findall(
            "./tei:teiHeader/tei:profileDesc/tei:abstract/tei:div",
            self.tei_namespace,
        ):
            yield from self._build_texts(div)

    @property
    def paragraphs(self) -> Generator[tuple[str, str], None, None]:
        """Get all paragraphs and titles of sections they are part of.

        Paragraphs can be parts of text body, or figure or table captions.

        Yields
        ------
        section_title : str
            The section title.
        text : str
            The paragraph content.
        """
        for div in self.content.findall(
            "./tei:text/tei:body/tei:div",
            self.tei_namespace,
        ):
            head = div.find("./tei:head", self.tei_namespace)
            section_title = self._element_to_str(head)
            # Everything but the heading is part of the section text.
            text_elements = [child for child in div if not child.tag.endswith("head")]
            for text in self._build_texts(text_elements):
                yield section_title, text

        # Figure and Table Caption
        for figure in self.content.findall(
            "./tei:text/tei:body/tei:figure", self.tei_namespace
        ):
            caption = figure.find("./tei:figDesc", self.tei_namespace)
            caption_str = self._element_to_str(caption)
            if not caption_str:
                continue
            # Tables are <figure> elements with type="table".
            if figure.get("type") == "table":
                yield "Table Caption", caption_str
            else:
                yield "Figure Caption", caption_str

    @property
    def arxiv_id(self) -> str | None:
        """Get arXiv ID.

        Returns
        -------
        str or None
            arXiv ID if specified, otherwise None.
        """
        return self._arxiv_id

    @property
    def doi(self) -> str | None:
        """Get DOI.

        Returns
        -------
        str or None
            DOI if specified, otherwise None.
        """
        return self.tei_ids.get("DOI")

    @property
    def tei_ids(self) -> dict:
        """Extract all IDs of the TEI XML.

        The IDs are extracted on first access and cached afterwards.

        Returns
        -------
        dict
            Dictionary containing all the IDs of the TEI XML content
            with the key being the ID type and the value being the ID value.
        """
        if self._tei_ids is None:
            self._tei_ids = {}
            for idno in self.content.findall(
                "./tei:teiHeader/tei:fileDesc/tei:sourceDesc"
                "/tei:biblStruct/tei:idno",
                self.tei_namespace,
            ):
                id_type = idno.get("type")
                self._tei_ids[id_type] = idno.text
        return self._tei_ids

    @staticmethod
    def _element_to_str(element: Element | None) -> str:
        """Convert an element and all its contents to a string.

        Parameters
        ----------
        element
            The input XML element. `None` is converted to an empty string.

        Returns
        -------
        str
            A parsed string representation of the input XML element.
        """
        if element is None:
            return ""
        return "".join(element.itertext())

    def _build_texts(self, elements: Iterable[Element]) -> Generator[str, None, None]:
        """Compose paragraphs and formulas to meaningful texts.

        In the abstract and main text of TEI XML parsers one finds a mix of
        <p> and <formula> tags. Several of these tags could be part of one
        sentence. This method tries to reconstruct sentences that are
        partitioned in this way. The formulas are replaced by the FORMULA
        placeholder.

        A paragraph starting with an ASCII uppercase letter starts a new
        text; any other paragraph and every formula continues the current one.

        Parameters
        ----------
        elements
            An iterable of <p> and <formula> elements.

        Yields
        ------
        str
            One or more sentences as one string.

        Raises
        ------
        RuntimeError
            If a tag is encountered that is neither <p> nor <formula>.
        """
        # Materialise the input once: the error message below needs a second
        # pass over all elements, which a one-shot iterable could not provide.
        elements = list(elements)

        # In TEI XML all tags are prefixed with the namespace.
        ns = self.tei_namespace["tei"]
        prefix = f"{{{ns}}}" if ns else ""

        # At every change ensure that there's no space at the end of text.
        # NOTE(review): a continuation or formula arriving while `text` is
        # still empty yields a text with a leading space -- confirm intended.
        text = ""

        def if_non_empty(text_: str) -> Generator[str, None, None]:
            """Yield if text is non-empty and make sure it ends with a period."""
            if text_:
                if not text_.endswith("."):
                    text_ += "."
                yield text_

        for child in elements:
            if child.tag == prefix + "p":
                p_text = self._element_to_str(child).strip()
                if not p_text:
                    continue
                if p_text[0] in string.ascii_uppercase:
                    # The sentence in the text has finished.
                    # Yield and start a new one
                    yield from if_non_empty(text)
                    text = p_text
                else:
                    # The sentence in the text continues
                    text += " " + p_text
            elif child.tag == prefix + "formula":
                # Maybe use FORMULA-BLOCK instead?
                text += " FORMULA"
            else:
                all_text = "".join(self._element_to_str(e) for e in elements)
                raise RuntimeError(
                    f"Unexpected tag: {child.tag}\nall text:\n{all_text}"
                )

        # Yield the last remaining text
        yield from if_non_empty(text)
@dataclass(frozen=True)
class Article(DataClassJSONMixin):
    """Immutable container for the contents of a scientific article."""

    title: str
    authors: Sequence[str]
    abstract: Sequence[str]
    # One (section title, paragraph text) tuple per paragraph.
    section_paragraphs: Sequence[Tuple[str, str]]
    pubmed_id: Optional[str] = None
    pmc_id: Optional[str] = None
    arxiv_id: Optional[str] = None
    doi: Optional[str] = None
    uid: Optional[str] = None

    @classmethod
    def parse(cls, parser: ArticleParser) -> Article:
        """Build an article from the data exposed by a parser.

        All iterables provided by the parser are materialised as tuples.

        Parameters
        ----------
        parser
            An article parser instance.

        Returns
        -------
        Article
            The article holding the parsed contents.
        """
        return cls(
            title=parser.title,
            authors=tuple(parser.authors),
            abstract=tuple(parser.abstract),
            section_paragraphs=tuple(parser.paragraphs),
            pubmed_id=parser.pubmed_id,
            pmc_id=parser.pmc_id,
            arxiv_id=parser.arxiv_id,
            doi=parser.doi,
            uid=parser.uid,
        )

    def iter_paragraphs(
        self, with_abstract: bool = False
    ) -> Generator[tuple[str, str], None, None]:
        """Iterate over all paragraphs in the article.

        Parameters
        ----------
        with_abstract : bool
            If true the abstract paragraphs come first, each one under the
            section title "Abstract".

        Yields
        ------
        str
            Section title of the section the paragraph is in.
        str
            The paragraph text.
        """
        if with_abstract:
            yield from (("Abstract", paragraph) for paragraph in self.abstract)
        yield from self.section_paragraphs

    def __str__(self) -> str:
        """Get a short summary of the article statistics.

        Returns
        -------
        str
            A summary of the article statistics.
        """
        # Character counts: abstract as a whole, main text per section title.
        abstract_length = sum(len(paragraph) for paragraph in self.abstract)
        section_lengths = {}
        for section_title, text in self.section_paragraphs:
            section_lengths[section_title] = (
                section_lengths.get(section_title, 0) + len(text)
            )
        main_text_length = sum(section_lengths.values())
        all_text_length = abstract_length + main_text_length

        # Assemble the summary line by line.
        lines = [
            f'Title : "{self.title}"',
            f'Authors : {", ".join(self.authors)}',
            f"Abstract : {len(self.abstract)} paragraph(s), "
            f"{abstract_length} characters",
            f"Sections : {len(section_lengths)} section(s) "
            f"{main_text_length} characters",
        ]
        lines.extend(f"- {section}" for section in section_lengths)
        lines.append(f"Total text length : {all_text_length}")
        return "\n".join(lines).strip()