# Source code for scrubadub.scrubbers

import re

import textblob
import phonenumbers
import nltk

from . import exceptions
from . import regexps


class Scrubber(object):
    """The Scrubber class is used to clean personal information out of
    dirty dirty text.

    Every ``clean_*`` method takes the text to scrub and returns a new string
    in which one kind of personal information has been swapped for a
    placeholder (``{{NAME}}``, ``{{EMAIL}}``, ...). The ``replacement``
    arguments are always inserted literally; they are never interpreted as
    regular expression templates.
    """

    def clean_with_placeholders(self, text):
        """This is the master method that cleans all of the filth out of the
        dirty dirty ``text`` using the default options for all of the other
        ``clean_*`` methods below.

        Raises ``exceptions.UnicodeRequired`` when ``text`` is not a unicode
        string (``unicode`` on Python 2, ``str`` on Python 3).
        """
        # ``unicode`` only exists on Python 2. On Python 3 ``str`` is the
        # unicode text type, so fall back to it instead of dying with a
        # NameError before the real type check can run.
        try:
            text_type = unicode  # noqa: F821
        except NameError:
            text_type = str
        if not isinstance(text, text_type):
            raise exceptions.UnicodeRequired

        # the order of the calls below matters:
        # * phone numbers needs to come before email addresses (#8)
        # * credentials need to come before email addresses (#9)
        # * skype needs to come before email addresses
        text = self.clean_proper_nouns(text)
        text = self.clean_urls(text)
        text = self.clean_phone_numbers(text)
        text = self.clean_credentials(text)
        text = self.clean_skype(text)
        text = self.clean_email_addresses(text)
        return text

    def clean_proper_nouns(self, text, replacement="{{NAME}}"):
        """Use part of speech tagging to clean proper nouns out of the dirty
        dirty ``text``.

        Every word that textblob tags as ``NNP`` or ``NNPS`` is replaced by
        ``replacement`` wherever it appears between word boundaries. The word
        "skype" is never treated as a proper noun.
        """

        # find the set of proper nouns using textblob. disallowed_nouns is a
        # workaround to make sure that downstream processing works correctly
        disallowed_nouns = set(["skype"])
        blob = textblob.TextBlob(text)
        proper_nouns = set(
            word for word, part_of_speech in blob.tags
            if part_of_speech in ("NNP", "NNPS") and
            word.lower() not in disallowed_nouns
        )

        # an empty alternation would match at every word boundary, so bail
        # out early when there is nothing to scrub
        if not proper_nouns:
            return text

        # replace all proper nouns in a single pass, escaping any lingering
        # punctuation in the regex (http://stackoverflow.com/a/4202559/564709).
        # A single pass guarantees that a placeholder inserted for one noun
        # is never rescanned and scrubbed again for another noun, and makes
        # the result independent of the set's iteration order. Longest nouns
        # go first so that a noun is preferred over any of its prefixes.
        alternatives = sorted(proper_nouns, key=len, reverse=True)
        proper_noun_re = (
            r'\b(?:' +
            '|'.join(re.escape(noun) for noun in alternatives) +
            r')\b'
        )
        # a callable keeps backslashes in ``replacement`` literal
        return re.sub(proper_noun_re, lambda match: replacement, text)

    def clean_email_addresses(self, text, replacement="{{EMAIL}}"):
        """Use regular expression magic to remove email addresses from dirty
        dirty ``text``. This method also catches email addresses like ``john at
        gmail.com``.

        Everything matched by ``regexps.EMAIL`` is replaced by
        ``replacement``.
        """
        # a callable keeps backslashes in ``replacement`` literal, consistent
        # with the other ``clean_*`` methods
        return regexps.EMAIL.sub(lambda match: replacement, text)

    def clean_urls(self, text, replacement="{{URL}}", keep_domain=False):
        """Use regular expressions to remove URLs that begin with ``http://``,
        ``https://`` or ``www.`` from dirty dirty ``text``.

        With ``keep_domain=True``, this method only obfuscates the path on a
        URL, not its domain. For example,
        ``http://twitter.com/someone/status/234978haoin`` becomes
        ``http://twitter.com/{{replacement}}``.
        """
        def scrub(match):
            # the ``domain`` group of ``regexps.URL`` is kept verbatim when
            # the caller asked to preserve it
            if keep_domain:
                return match.group('domain') + replacement
            return replacement

        # substitute each match at its own position. Replacing the matched
        # *string* everywhere (``str.replace``) breaks when one URL is a
        # prefix of another (the longer URL's path is left behind) and can
        # re-expand text that has already been scrubbed.
        return regexps.URL.sub(scrub, text)

    def clean_phone_numbers(self, text, replacement="{{PHONE}}", region="US"):
        """Remove phone numbers from dirty dirty ``text`` using
        `python-phonenumbers
        <https://github.com/daviddrysdale/python-phonenumbers>`_, a port of a
        Google project to correctly format phone numbers in text.

        ``region`` specifies the best guess region to start with (default:
        ``"US"``). Specify ``None`` to only consider numbers with a leading
        ``+`` to be considered.
        """
        # rebuild the text from the untouched stretches between matches so
        # that every phone number is replaced exactly where it was found;
        # a global ``str.replace`` would also rewrite unrelated occurrences
        # of the same characters (e.g. inside a longer number)
        pieces = []
        cursor = 0
        for match in phonenumbers.PhoneNumberMatcher(text, region):
            pieces.append(text[cursor:match.start])
            pieces.append(replacement)
            cursor = match.end
        pieces.append(text[cursor:])
        return "".join(pieces)

    def clean_credentials(self, text,
                          username_replacement="{{USERNAME}}",
                          password_replacement="{{PASSWORD}}"):
        """Remove username/password combinations from dirty dirty ``text``.

        For every match of ``regexps.CREDENTIALS`` the ``username`` group is
        replaced by ``username_replacement`` and the ``password`` group by
        ``password_replacement``; the text between the two is kept.
        """
        position = 0
        while True:
            match = regexps.CREDENTIALS.search(text, position)
            if match is None:
                break
            ubeg, uend = match.span('username')
            pbeg, pend = match.span('password')
            scrubbed = (
                text[:ubeg] + username_replacement + text[uend:pbeg] +
                password_replacement + text[pend:]
            )
            # ``match.end()`` is an index into the text *before* the
            # replacement; shift it by the change in length so the next
            # search resumes right after this match in the new text instead
            # of skipping or rescanning characters
            position = match.end() + len(scrubbed) - len(text)
            text = scrubbed
        return text

    def clean_skype(self, text, replacement="{{SKYPE}}", word_radius=10):
        """Skype usernames tend to be used inline in dirty dirty text quite
        often but also appear as ``skype: {{SKYPE}}`` quite a bit. This method
        looks at words within ``word_radius`` words of "skype" for things that
        appear to be misspelled or have punctuation in them as a means to
        identify skype usernames.

        Default ``word_radius`` is 10, corresponding with the rough scale of
        half of a sentence before or after the word "skype" is used. Increasing
        the ``word_radius`` will increase the false positive rate and
        decreasing the ``word_radius`` will increase the false negative rate.

        Every occurrence of an identified username in ``text`` is replaced by
        ``replacement``.
        """

        # find 'skype' in the text using a customized tokenizer. this makes
        # sure that all valid skype usernames are kept as tokens and not split
        # into different words
        tokenizer = nltk.tokenize.regexp.RegexpTokenizer(regexps.SKYPE_TOKEN)
        blob = textblob.TextBlob(text, tokenizer=tokenizer)
        tokens = list(blob.tokens)
        skype_indices = [
            i for i, token in enumerate(tokens) if 'skype' in token.lower()
        ]

        # go through the words before and after skype words to identify
        # potential skype usernames.
        skype_usernames = set()
        for i in skype_indices:
            jmin = max(i - word_radius, 0)
            jmax = min(i + word_radius + 1, len(tokens))
            for j in range(jmin, jmax):
                if j == i:
                    # the 'skype' token itself is not a candidate
                    continue
                token = tokens[j]
                if not regexps.SKYPE_USERNAME.match(token):
                    continue

                # this token is a valid skype username. Most skype
                # usernames appear to be misspelled words. Word.spellcheck
                # does not handle the situation of an all caps word very
                # well, so we check a lower case copy of such words. The
                # token is remembered exactly as it appears in the text,
                # otherwise an all caps username could never be found again
                # when replacing.
                candidate = token.lower() if token.isupper() else token
                suggestions = textblob.Word(candidate).spellcheck()
                score = suggestions[0][1]
                if score < 0.5:
                    skype_usernames.add(token)

        # an empty alternation would match everywhere, so bail out early
        if not skype_usernames:
            return text

        # replace all skype usernames in a single pass so that a placeholder
        # inserted for one username is never rewritten for another; longest
        # usernames go first so a username wins over any of its prefixes
        alternatives = sorted(skype_usernames, key=len, reverse=True)
        username_re = '|'.join(re.escape(name) for name in alternatives)
        return re.sub(username_re, lambda match: replacement, text)