Source code for scrubadub.detectors.skype

import re
import nltk
import textblob

from textblob.blob import BaseBlob
from textblob.en.taggers import PatternTagger

from typing import Optional, Generator

from scrubadub.detectors.catalogue import register_detector
from .base import RegexDetector
from ..filth import SkypeFilth, Filth

# BaseBlob uses NLTKTagger as a pos_tagger, but it works wrong
BaseBlob.pos_tagger = PatternTagger()


[docs]@register_detector class SkypeDetector(RegexDetector): """Skype usernames tend to be used inline in dirty dirty text quite often but also appear as ``skype: {{SKYPE}}`` quite a bit. This method looks at words within ``word_radius`` words of "skype" for things that appear to be misspelled or have punctuation in them as a means to identify skype usernames. Default ``word_radius`` is 10, corresponding with the rough scale of half of a sentence before or after the word "skype" is used. Increasing the ``word_radius`` will increase the false positive rate and decreasing the ``word_radius`` will increase the false negative rate. """ filth_cls = SkypeFilth name = 'skype' autoload = False word_radius = 10 # these two regular expressions are used to validate a skype usernames. # _TOKEN is the core regular expression that is used to chunk text into # tokens to make sure all valid skype usernames are considered the same # token. Importantly, the word "skype" must pass the _SKYPE regex. # SKYPE_TOKEN is used to tokenize text and SKYPE_USERNAME is the same thing # but with the 6-32 character limit imposed on the username. adapted from # http://bit.ly/1FQs1hD _SKYPE = r'[a-zA-Z][a-zA-Z0-9_\-\,\.]' SKYPE_TOKEN = _SKYPE + '+' SKYPE_USERNAME = re.compile(_SKYPE+'{5,31}')
[docs] def iter_filth(self, text, document_name: Optional[str] = None) -> Generator[Filth, None, None]: """Yields discovered filth in the provided ``text``. :param text: The dirty text to clean. :type text: str :param document_name: The name of the document to clean. :type document_name: str, optional :return: An iterator to the discovered :class:`Filth` :rtype: Iterator[:class:`Filth`] """ # find 'skype' in the text using a customized tokenizer. this makes # sure that all valid skype usernames are kept as tokens and not split # into different words tokenizer = nltk.tokenize.regexp.RegexpTokenizer( self.SKYPE_TOKEN ) blob = textblob.TextBlob(text, tokenizer=tokenizer) skype_indices, tokens = [], [] for i, token in enumerate(blob.tokens): tokens.append(token) if 'skype' in token.lower(): skype_indices.append(i) # go through the words before and after skype words to identify # potential skype usernames. skype_usernames = [] for i in skype_indices: jmin = max(i-self.word_radius, 0) jmax = min(i+self.word_radius+1, len(tokens)) for j in list(range(jmin, i)) + list(range(i+1, jmax)): token = tokens[j] if self.SKYPE_USERNAME.match(token): # this token is a valid skype username. Most skype # usernames appear to be misspelled words. Word.spellcheck # does not handle the situation of an all caps word very # well, so we cast these to all lower case before checking # whether the word is misspelled if token.isupper(): token = token.lower() word = textblob.Word(token) suggestions = word.spellcheck() corrected_word, score = suggestions[0] if score < 0.5: skype_usernames.append(token) # replace all skype usernames if skype_usernames: self.regex = re.compile('|'.join(skype_usernames)) yield from super(SkypeDetector, self).iter_filth(text, document_name=document_name) return