Source code for scrubadub.detectors.url

import re

from scrubadub.detectors.catalogue import register_detector
from .base import RegexDetector
from ..filth import UrlFilth


[docs]@register_detector
class UrlDetector(RegexDetector):
    """Use regular expressions to remove URLs that begin with ``http://``,
    ``https://`` or ``www.`` from dirty dirty ``text``.

    With ``keep_domain=True``, this detector only obfuscates the path on a
    URL, not its domain. For example,
    ``http://twitter.com/someone/status/234978haoin`` becomes
    ``http://twitter.com/{{replacement}}``.
    """
    filth_cls = UrlFilth
    name = 'url'
    autoload = True

    # this regular expression is convenient for captures the domain name
    # and the path separately, which is useful for keeping the domain name
    # but sanitizing the path altogether
    regex = re.compile(r'''
        (?P<domain>
            (https?:\/\/(www\.)?|www\.)          # protocol http://, etc
            [\-\w@:%\.\+~\#=]{2,256}\.[a-z]{2,6} # domain name
            /?                                   # can have a trailing slash
        )(?P<path>
            [\-\w@:%\+\.~\#?&/=]*                # rest of path, query, & hash
        )
    ''', re.VERBOSE)