Source code for bluesearch.database.topic_rule

# Blue Brain Search is a text mining toolbox focused on scientific use cases.
#
# Copyright (C) 2020  Blue Brain Project, EPFL.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Implementation of the TopicRule data structure."""
from __future__ import annotations

import re
from typing import Any, Iterable

from bluesearch.database.article import ArticleSource
from bluesearch.database.topic_info import TopicInfo


[docs]class TopicRule: """Rule for accepting/rejecting an article based on topic matching criteria. Parameters ---------- level Level of the topic information to match, must be "article" or "journal". Passing `None` will match any level. source Article source, must be a valid ArticleSource (e.g. "arxiv", "pmc", ...). Passing `None` will match any source. pattern Regular expression for matching the topic names of a given article. Passing `None` will match the name of any topic. """ def __init__( self, level: str | None = None, source: str | ArticleSource | None = None, pattern: str | re.Pattern | None = None, ): if level is not None and level not in {"article", "journal"}: raise ValueError(f"Unsupported level {level}.") self.level = level self.source = ArticleSource(source) if source is not None else None self.pattern = re.compile(pattern) if pattern is not None else None
[docs] def match(self, topic_info: TopicInfo) -> bool: """Determine whether a topic_info matches the rule. Note that the keys (topic sources) of the `topic_info.article_topics` and `topic_info.journal_topics` dictionaries are completely disregarded. And all the values (lists) are simply concatenated. """ if self.source is not None and self.source is not topic_info.source: return False if self.pattern is None: return True if self.level is None or self.level == "article": for topic_list in topic_info.article_topics.values(): if any(self.pattern.search(topic) for topic in topic_list): return True if self.level is None or self.level == "journal": for topic_list in topic_info.journal_topics.values(): if any(self.pattern.search(topic) for topic in topic_list): return True return False
def __eq__(self, other: Any) -> bool: """Compare to another topic rule.""" if not isinstance(other, TopicRule): return False return ( self.level == other.level and self.source == other.source and self.pattern == other.pattern )
[docs]def check_topic_rules( topic_info: TopicInfo, topic_rules_accept: Iterable[TopicRule], topic_rules_reject: Iterable[TopicRule], ) -> bool: """Check whether the topic info of an article satisfies given topic rules. The `topic_info` needs to satisfy both of the below conditions to be accepted: * At least one rule within `topic_rules_accept` is satisfied * No rules in `topic_rules_reject` are satisfied Parameters ---------- topic_info Topic info to accept or reject. topic_rules_accept List of topic rules to accept a given topic_info. topic_rules_reject List of topic rules to reject a given topic_info. Returns ------- bool If True, the topic info matches satisfies both conditions explained above. If False, at least one of the conditions is not satisfied. """ # Go through rejection rules if any(reject_rule.match(topic_info) for reject_rule in topic_rules_reject): return False # Go through acceptance rules if any(accept_rule.match(topic_info) for accept_rule in topic_rules_accept): return True return False