Source code for scrubadub.detectors.email

import re

from typing import Optional, Generator

from scrubadub.detectors.catalogue import register_detector
from .base import RegexDetector
from ..filth import EmailFilth, Filth


@register_detector
class EmailDetector(RegexDetector):
    """Detect email addresses in text by means of a regular expression.

    Besides the usual ``user@example.com`` form, the pattern also accepts
    addresses in which the ``@`` and/or the dots are spelled out as the
    whitespace-delimited words ``at`` and ``dot`` (for example
    ``john at gmail.com``).
    """

    filth_cls = EmailFilth
    name = 'email'
    autoload = True

    # This pattern is a reasonable starting point rather than a complete
    # solution; in particular it copes poorly with people who spell out the
    # hyphens of their address in words.
    #
    # adapted from https://gist.github.com/dideler/5219706
    #
    # The pattern is compiled with re.VERBOSE, so the blanks that lead some of
    # the fragments below are ignored by the regex engine. Every ``#`` that is
    # part of the pattern sits inside a character class and is therefore
    # matched literally instead of starting a regex comment.
    regex = re.compile(
        # -- local part (before the separator) -------------------------------
        r"\b[a-z0-9!#$%&'*+\/=?^_`{|}~-]"           # first character (no '.')
        r"(?:"
        r" [\.a-z0-9!#$%&'*+\/=?^_`{|}~-]{0,62}"    # up to 62 more, '.' allowed (64 in total)
        r" [a-z0-9!#$%&'*+\/=?^_`{|}~-]"            # last character (no '.')
        r")?"
        # -- separator --------------------------------------------------------
        r"(?:@|\sat\s)"                             # '@', or the word 'at' between whitespace
        # -- domain -----------------------------------------------------------
        r"[a-z0-9]"                                 # first character of the domain
        r"(?:"
        r" (?=[a-z0-9-]*(\.|\sdot\s))"              # lookahead: a '.' or ' dot ' must still follow
        r" (?:\.|\sdot\s|[a-z0-9-]){0,251}"         # domain characters, '.' or the word 'dot'
        r" [a-z0-9]"                                # each repetition ends on a letter or digit
        r")+\b",
        re.IGNORECASE | re.VERBOSE,
    )

    # Cheap pre-filters used by ``iter_filth``: one finds an '@' / ' at '
    # separator, the other a '.' / ' dot ' separator, anywhere in the text.
    at_matcher = re.compile(r"@|\sat\s", re.IGNORECASE)
    dot_matcher = re.compile(r"\.|\sdot\s", re.IGNORECASE)

    def iter_filth(
        self,
        text: str,
        document_name: Optional[str] = None,
    ) -> Generator[Filth, None, None]:
        """Yield the email filth discovered in ``text``.

        Nothing is yielded unless the text contains both an ``@``/``at``
        separator and a ``.``/``dot`` separator; when both are present the
        search is delegated to :meth:`RegexDetector.iter_filth`.

        :param text: The dirty text to clean.
        :type text: str
        :param document_name: The name of the document to clean.
        :type document_name: str, optional
        :return: An iterator to the discovered :class:`Filth`
        :rtype: Iterator[:class:`Filth`]
        """
        # Bail out early when the text cannot contain an address at all. The
        # '@' check runs first and the '.' check only if it succeeded.
        if self.at_matcher.search(text) is None:
            return
        if self.dot_matcher.search(text) is None:
            return

        yield from super().iter_filth(text=text, document_name=document_name)