Source code for bluesearch.database.topic_info
# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020 Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Implementation of the TopicInfo data structure."""
from __future__ import annotations
import copy
import datetime
import pathlib
from collections.abc import Iterable
from dataclasses import dataclass, field
from typing import Any
import bluesearch
from bluesearch.database.article import ArticleSource
@dataclass
class TopicInfo:
    """The topic information extracted from a journal article.

    For the spec see the following GitHub issue/comment:
    https://github.com/BlueBrain/Search/issues/518#issuecomment-985525160

    Parameters
    ----------
    source
        The source of the article. Its ``value`` attribute is what gets
        serialized by :meth:`json`.
    path
        The path of the file the topics were extracted from. It is converted
        to a resolved ``pathlib.Path`` right after initialization.
    element_in_file
        Optional integer that is stored in the serialized metadata when it
        is not ``None``. Presumably the position of the article inside a
        file holding several articles -- confirm against the spec.

    Attributes
    ----------
    article_topics
        Mapping ``kind -> sorted list of unique topics`` for the article.
    journal_topics
        Mapping ``kind -> sorted list of unique topics`` for the journal.
    creation_date
        The (naive, local) time at which the instance was created. It is a
        plain instance attribute rather than a dataclass field, so it does
        not take part in the generated ``__eq__`` and ``__repr__``.
    """

    source: ArticleSource
    path: str | pathlib.Path
    element_in_file: int | None = None
    article_topics: dict[str, list[str]] = field(init=False, default_factory=dict)
    journal_topics: dict[str, list[str]] = field(init=False, default_factory=dict)

    def __post_init__(self) -> None:
        """Run the post-initialization.

        Records the creation time and normalizes ``path`` to a resolved
        ``pathlib.Path``.
        """
        self.creation_date = datetime.datetime.now()
        self.path = pathlib.Path(self.path).resolve()

    @staticmethod
    def _add_topics(
        mapping: dict[str, list[str]], kind: str, new_topics: Iterable[str]
    ) -> None:
        """Add topics to a mapping with collection of topics.

        The entry for ``kind`` is replaced by a sorted list of the unique
        topics from the previous entry and ``new_topics``. The previous list
        object is never modified, and ``mapping`` is only written to once
        merging and sorting succeeded, so a failure (e.g. a topic that is
        unhashable or cannot be ordered against strings) leaves ``mapping``
        untouched.

        Parameters
        ----------
        mapping
            A mapping of the form kind -> list-of-topics that shall be
            updated in-place. For example ``{"MeSH": ["topic 1", "topic 2"]}``.
        kind
            The topic kind. Corresponds to a key in ``mapping``.
        new_topics
            The topics to add. Corresponds to a value in ``mapping``.
        """
        # Work on a separate set instead of extending the stored list, so
        # the stored list is not left half-updated if anything below raises.
        merged = set(mapping.get(kind, ()))
        merged.update(new_topics)
        mapping[kind] = sorted(merged)

    def add_article_topics(self, kind: str, topics: Iterable[str]) -> None:
        """Add article topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A collection of the topics to add.
        """
        self._add_topics(self.article_topics, kind, topics)

    def add_journal_topics(self, kind: str, topics: Iterable[str]) -> None:
        """Add journal topics.

        Parameters
        ----------
        kind
            The topic kind. For example "MeSH" or "MAG".
        topics
            A collection of the topics to add.
        """
        self._add_topics(self.journal_topics, kind, topics)

    def json(self) -> dict:
        """Convert the contents of this class to a structured dictionary.

        Apart from the source, path and topic entries a "metadata" top-level
        key will be added containing a dictionary with entries "created-date"
        and "bbs-version". If ``element_in_file`` is not ``None`` it is added
        to the metadata under the key "element_in_file".

        Returns
        -------
        dict
            The structure dictionary with all topic information. The topic
            mappings are deep copies, so modifying the result does not
            affect this instance.
        """
        metadata: dict[str, Any] = {
            "created-date": self.creation_date.strftime("%Y-%m-%d %H:%M:%S"),
            "bbs-version": bluesearch.__version__,
        }
        if self.element_in_file is not None:
            metadata["element_in_file"] = self.element_in_file

        return {
            "source": self.source.value,
            "path": str(self.path),
            "topics": {
                "article": copy.deepcopy(self.article_topics),
                "journal": copy.deepcopy(self.journal_topics),
            },
            "metadata": metadata,
        }

    @classmethod
    def from_dict(cls, data: dict) -> TopicInfo:
        """Parse topic info from a dictionary.

        The expected layout is the one produced by :meth:`json`. Only the
        "element_in_file" entry of the metadata is read; the stored
        "created-date" is not restored, so the returned instance carries
        the time of parsing as its creation date.

        Parameters
        ----------
        data
            The dictionary to parse. Must contain the keys "source", "path",
            "metadata" and "topics", the latter with the sub-keys "article"
            and "journal".

        Returns
        -------
        TopicInfo
            The parsed topic info.

        Raises
        ------
        KeyError
            If one of the required keys is missing from ``data``.
        """
        source = ArticleSource(data["source"])
        path = data["path"]
        element_in_file = data["metadata"].get("element_in_file")
        topic_info = cls(source=source, path=path, element_in_file=element_in_file)

        for topic_type, topics in data["topics"]["article"].items():
            topic_info.add_article_topics(topic_type, topics)
        for topic_type, topics in data["topics"]["journal"].items():
            topic_info.add_journal_topics(topic_type, topics)

        return topic_info