Source code for scrubadub.detectors.postalcode

import re

from scrubadub.detectors.catalogue import register_detector
from .base import RegionLocalisedRegexDetector
from ..filth.postalcode import PostalCodeFilth


@register_detector
class PostalCodeDetector(RegionLocalisedRegexDetector):
    """Detects postal codes, currently only British post codes are supported.

    The detection logic is a single compiled regular expression per region,
    stored in ``region_regex`` and keyed by region code. Only ``'GB'`` is
    defined here. How the pattern is selected and applied is up to the
    ``RegionLocalisedRegexDetector`` base class, which is not shown in this
    module.

    The ``'GB'`` pattern is written with ``re.VERBOSE``, so any literal space
    that must be matched has to be escaped (``\\ ``) or placed inside a
    character class. Most alternatives spell out both upper and lower case
    letters; the zip-code style overseas territory prefixes (``KY``, ``MSR``,
    ``VG``, ``AI``) are matched in upper case only.
    """

    # Filth class associated with this detector (consumed by the base class;
    # exact usage is not visible here).
    filth_cls = PostalCodeFilth
    # Identifier of this detector.
    name = 'postalcode'
    # NOTE(review): presumably tells the detector catalogue to load this
    # detector by default - confirm against ``register_detector``.
    autoload = True

    # Maps a region code to the compiled pattern used for that region.
    region_regex = {
        # Informed by https://en.wikipedia.org/wiki/Postcodes_in_the_United_Kingdom#Validation
        # and validated against https://osdatahub.os.uk/downloads/open/CodePointOpen
        'GB': re.compile(r"""
            (
                # Girobank postcode
                # The space is escaped on purpose: under re.VERBOSE a bare space
                # is dropped and the quantifier would attach to [rR] instead.
                (?:[gG][iI][rR]\ {0,}0[aA]{2})|
                (?:
                    # British Overseas Territories in usual format
                    (?:
                        [aA][sS][cC][nN]|
                        [sS][tT][hH][lL]|
                        [tT][dD][cC][uU]|
                        [bB][bB][nN][dD]|
                        [bB][iI][qQ][qQ]|
                        [fF][iI][qQ][qQ]|
                        [pP][cC][rR][nN]|
                        [sS][iI][qQ][qQ]|
                        [tT][kK][cC][aA]
                    )
                    \ {0,}1[zZ]{2}
                )|
                (?:
                    # British Overseas Territories in zip-code format
                    (KY[0-9]|MSR|VG|AI)[ -]{0,}[0-9]{4}
                )|
                # (?:
                #     # Bermuda including this causes too many false positives, so excluded for now
                #     [a-zA-Z]{2}\ {0,}[0-9]{2}
                # )|
                (?:
                    # British Forces Post Office
                    [Bb][Ff][Pp][Oo]\ {0,}[0-9]{1,4}
                )|
                (?:
                    # Mainland British postcodes
                    (?:
                        (?:[Ww][Cc][0-9][abehmnprvwxyABEHMNPRVWXY])|
                        (?:[Ee][Cc][1-4][abehmnprvwxyABEHMNPRVWXY])|
                        (?:[Nn][Ww]1[Ww])|
                        (?:[Ss][Ee]1[Pp])|
                        (?:[Ss][Ww]1[abehmnprvwxyABEHMNPRVWXY])|
                        (?:[EeNnWw]1[a-hjkpstuwA-HJKPSTUW])|
                        (?:[BbEeGgLlMmNnSsWw][0-9][0-9]?)|
                        (?:[a-pr-uwyzA-PR-UWYZ][a-hk-yxA-HK-XY][0-9][0-9]?)
                    )
                    \ {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2}
                )
            )
        """, re.VERBOSE),
    }