Source code for scrubadub.detectors.email

import re

from typing import Optional, Generator

from scrubadub.detectors.catalogue import register_detector
from .base import RegexDetector
from ..filth import EmailFilth, Filth


[docs]@register_detector
class EmailDetector(RegexDetector):
    """Use regular expression magic to remove email addresses from dirty
    dirty ``text``. This method also catches email addresses like ``john at
    gmail.com``.
    """
    filth_cls = EmailFilth
    name = 'email'
    autoload = True

    # there may be better solutions than this out there and this certainly
    # doesn't do that great of a job with people that spell out the
    # hyphenation of their email address, but its a pretty solid start.
    #
    # adapted from https://gist.github.com/dideler/5219706
    regex = re.compile((
        r"\b[a-z0-9!#$%&'*+\/=?^_`{|}~-]"             # start with this character
        r"(?:"
        r"    [\.a-z0-9!#$%&'*+\/=?^_`{|}~-]{0,62}"   # valid next characters (max length 64 chars before @)
        r"    [a-z0-9!#$%&'*+\/=?^_`{|}~-]"           # end with this character
        r")?"
        r"(?:@|\sat\s)"                               # @ or the word 'at' instead
        r"[a-z0-9]"                                   # domain starts like this
        r"(?:"
        r"    (?=[a-z0-9-]*(\.|\sdot\s))"             # A lookahead to ensure there is a dot in the domain
        r"    (?:\.|\sdot\s|[a-z0-9-]){0,251}"        # might have a '.' or the word 'dot' instead
        r"    [a-z0-9]"                               # domain has max 253 chars, ends with one of these
        r")+\b"
    ), re.VERBOSE | re.IGNORECASE)

    at_matcher = re.compile(r"@|\sat\s", re.IGNORECASE)
    dot_matcher = re.compile(r"\.|\sdot\s", re.IGNORECASE)

[docs]    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yields discovered filth in the provided ``text``.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """

        if re.search(self.at_matcher, text) and re.search(self.dot_matcher, text):
            yield from super().iter_filth(text=text, document_name=document_name)