Source code for scrubadub.detectors.date_of_birth

"""
This module provides a detector for dates of birth.

The filter_type = contextual simply looks for any detected dates that are more than 18 years old
where a date of birth has been mentioned within the 2 previous lines.

Another option is to nuke all dates of birth that are between 18 and 100 years old.

Using 18+ years may not be suitable for all use cases; use with caution.
"""
import re
import logging
from dateparser.search import search_dates
from datetime import datetime

from typing import Optional, List, Generator

from scrubadub.detectors.catalogue import register_detector
from .base import Detector
from ..filth.base import Filth
from ..filth.date_of_birth import DateOfBirthFilth


@register_detector
class DateOfBirthDetector(Detector):
    """This detector aims to detect dates of birth in text.

    First all possible dates are found, then they are filtered to those that would result in people being between
    ``DateOfBirthFilth.min_age_years`` and ``DateOfBirthFilth.max_age_years``, which default to 18 and 100
    respectively.

    If ``require_context`` is True, we search for one of the possible ``context_words`` near the found date. We search
    up to ``context_before`` lines before the date and up to ``context_after`` lines after the date. The context that
    we search for are terms like `'birth'` or `'DoB'` to increase the likelihood that the date is indeed a date of
    birth. The context words can be set using the ``context_words`` parameter, which expects a list of strings.

    >>> import scrubadub, scrubadub.detectors.date_of_birth
    >>> DateOfBirthFilth.min_age_years = 12
    >>> scrubber = scrubadub.Scrubber(detector_list=[
    ...     scrubadub.detectors.date_of_birth.DateOfBirthDetector(),
    ... ])
    >>> scrubber.clean("I was born on 10-Nov-2008.")
    'I was born {{DATE_OF_BIRTH}}.'
    """
    name = 'date_of_birth'
    filth_cls = DateOfBirthFilth
    autoload = False

    # Default context words, keyed by two letter language code. The keys of this map also define which
    # languages ``supported_locale`` accepts.
    context_words_language_map = {
        'en': ['birth', 'born', 'dob', 'd.o.b.'],
        'de': ['geburt', 'geboren', 'geb', 'geb.'],
    }

    def __init__(self, context_before: int = 2, context_after: int = 1, require_context: bool = True,
                 context_words: Optional[List[str]] = None, **kwargs):
        """Initialise the detector.

        :param context_before: The number of lines of context to search before the date
        :type context_before: int
        :param context_after: The number of lines of context to search after the date
        :type context_after: int
        :param require_context: Set to False if your dates of birth are not near words that provide context (such as
            "birth" or "DOB").
        :type require_context: bool
        :param context_words: A list of words that provide context related to dates of birth, such as the following:
            'birth', 'born', 'dob' or 'd.o.b.'. When omitted, the defaults for the detector's language are taken from
            ``context_words_language_map``. The words are lower-cased before they are stored.
        :type context_words: List[str], optional
        :param name: Overrides the default name of the :class:``Detector``
        :type name: str, optional
        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
            underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str, optional
        :raises ValueError: If the detector's language has no entry in ``context_words_language_map``. This is raised
            even when ``context_words`` is supplied explicitly.
        """
        # Remaining keyword arguments (eg ``name`` and ``locale``) are forwarded to the base class.
        super().__init__(**kwargs)

        self.context_before = context_before
        self.context_after = context_after
        self.require_context = require_context

        # NOTE(review): ``self.language`` is presumably set by the base class initialiser - confirm.
        try:
            default_words = self.context_words_language_map[self.language]
        except KeyError:
            raise ValueError("DateOfBirthDetector does not support language {}.".format(self.language))

        chosen_words = default_words if context_words is None else context_words
        # Context matching is done against lower-cased text, so normalise the words once here. This also copies
        # the list, so the class-level defaults are never shared with the instance.
        self.context_words = [word.lower() for word in chosen_words]

    def _has_context(self, identified_string: str, lines: List[str]) -> bool:
        """Return True if any line containing ``identified_string`` has a context word close to it.

        :param identified_string: The date text to look for
        :type identified_string: str
        :param lines: The document, already split into lines
        :type lines: List[str]
        :return: ``True`` if a context word was found near at least one occurrence, otherwise ``False``
        :rtype: bool
        """
        for i_line, line in enumerate(lines):
            if identified_string not in line:
                continue

            # Clamp both bounds at zero so that a negative index can never wrap around to the end of the list.
            from_line = max(i_line - self.context_before, 0)
            to_line = max(i_line + self.context_after + 1, 0)
            text_context = ' '.join(lines[from_line:to_line]).lower()

            # Context near any single occurrence is enough: all instances of the string are then treated as PII.
            if any(context_word in text_context for context_word in self.context_words):
                return True

        return False

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Search ``text`` for ``Filth`` and return a generator of ``Filth`` objects.

        Each distinct date string reported by ``dateparser`` is processed once, and every occurrence of it in
        ``text`` is yielded once, even if ``dateparser`` reports the same string several times.

        :param text: The dirty text that this Detector should search
        :type text: str
        :param document_name: Name of the document this is being passed to this detector
        :type document_name: Optional[str]
        :return: The found Filth in the text
        :rtype: Generator[Filth]
        :raises RecursionError: Re-raised, after logging the document name, if ``dateparser`` hits the recursion limit
        """
        # using the dateparser lib - locale can be set here
        try:
            date_picker = search_dates(text, languages=[self.language])
        except RecursionError:
            logger = logging.getLogger("scrubadub.detectors.date_of_birth.DateOfBirthDetector")
            logger.error(f"The document '{document_name}' caused a recursion error in dateparser.")
            raise

        # ``None`` is handled as "no dates found".
        if date_picker is None:
            return

        lines = text.split('\n')
        current_year = datetime.now().year
        # Every occurrence of a string is located below with a search over the whole text, so handling the same
        # string a second time would only yield the same spans again.
        processed_strings: set = set()

        for identified_string, identified_date in date_picker:
            if identified_string in processed_strings:
                continue
            processed_strings.add(identified_string)

            # Skip anything that could be a phone number, dates rarely begin with a plus
            if str(identified_string).startswith('+'):
                continue

            # Skip any dates that fall outside of the configured age range. The age is the difference between
            # calendar years; month and day are not taken into account.
            years_since_identified_date = current_year - identified_date.year
            within_age_range = (DateOfBirthFilth.min_age_years <= years_since_identified_date <=
                                DateOfBirthFilth.max_age_years)
            if not within_age_range:
                continue

            # If it is desired, search for context; if no context is found this isn't PII, so skip this date
            if self.require_context and not self._has_context(identified_string, lines):
                continue

            for instance in re.finditer(re.escape(identified_string), text):
                yield DateOfBirthFilth(
                    beg=instance.start(),
                    end=instance.end(),
                    text=instance.group(),
                    detector_name=self.name,
                    document_name=document_name,
                    locale=self.locale,
                )

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Returns true if this ``Detector`` supports the given locale.

        Only the language part of the locale is checked; it must be a key of ``context_words_language_map``.

        :param locale: The locale of the documents; it is split into a language and a region with
            ``locale_split`` and the region is ignored.
        :type locale: str
        :return: ``True`` if the locale is supported, otherwise ``False``
        :rtype: bool
        """
        language, _ = cls.locale_split(locale)
        return language in cls.context_words_language_map
# Public names exported by this module on a star-import.
__all__ = ['DateOfBirthDetector']