Source code for scrubadub.detectors.text_blob

import re
import textblob

from textblob.blob import BaseBlob
from textblob.en.taggers import PatternTagger

from typing import Optional, Generator

from scrubadub.detectors.catalogue import register_detector
from .base import RegexDetector
from ..filth import NameFilth, Filth
from ..utils import CanonicalStringSet

# BaseBlob uses NLTKTagger as its default pos_tagger, but it does not work correctly here,
# so use PatternTagger instead.
BaseBlob.pos_tagger = PatternTagger()


@register_detector
class TextBlobNameDetector(RegexDetector):
    """Use part of speech tagging from textblob to clean proper nouns out of the
    dirty dirty ``text``. Disallow particular nouns by adding them to the
    ``TextBlobNameDetector.disallowed_nouns`` set.
    """
    filth_cls = NameFilth
    name = 'text_blob_name'
    autoload = False
    disallowed_nouns = CanonicalStringSet(["skype"])

    def iter_filth(self, text, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yields discovered filth in the provided ``text``.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """
        if not isinstance(self.disallowed_nouns, CanonicalStringSet):
            raise TypeError(
                'TextBlobNameDetector.disallowed_nouns must be a CanonicalStringSet'
            )

        # Find the set of proper nouns (NNP/NNPS tags) using textblob.
        proper_nouns = set()
        blob = textblob.TextBlob(text)
        for word, part_of_speech in blob.tags:
            is_proper_noun = part_of_speech in ("NNP", "NNPS")
            if is_proper_noun and word.lower() not in self.disallowed_nouns:
                proper_nouns.add(word)

        # Use a regex to replace the proper nouns, first escaping any lingering
        # punctuation in the regex.
        # http://stackoverflow.com/a/4202559/564709
        if proper_nouns:
            re_list = []
            for proper_noun in proper_nouns:
                re_list.append(r'\b' + re.escape(str(proper_noun)) + r'\b')
            self.regex = re.compile('|'.join(re_list))
            yield from super(TextBlobNameDetector, self).iter_filth(
                text, document_name=document_name)
        return

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Returns true if this ``Detector`` supports the given locale.

        :param locale: The locale of the documents in the format: 2 letter
            lower-case language code followed by an underscore and the two letter
            upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` if the locale is supported, otherwise ``False``
        :rtype: bool
        """
        language, region = cls.locale_split(locale)
        # fr and de are possible through plugins, but need to be implemented on this end:
        # https://github.com/sloria/textblob-fr and https://github.com/markuskiller/textblob-de
        return language in ['en', ]
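
A minimal usage sketch (not part of the module above): because the detector sets
``autoload = False``, it has to be added to a ``Scrubber`` explicitly. The
``Scrubber()``, ``add_detector()`` and ``clean()`` calls below follow the general
scrubadub API; the exact placeholder text in the output is an assumption.

import scrubadub
from scrubadub.detectors.text_blob import TextBlobNameDetector

# Register the textblob-based name detector by hand, since it does not load
# automatically (autoload = False).
scrubber = scrubadub.Scrubber()
scrubber.add_detector(TextBlobNameDetector)

# Proper nouns tagged NNP/NNPS (e.g. "John") are matched and replaced with a
# name placeholder; "skype" is in disallowed_nouns, so it is left untouched.
print(scrubber.clean("John messaged me on skype yesterday."))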