個人情報マスク

ただし、Amazon Comprehend の PII 検出（detect_pii_entities）は、現時点で英語とスペイン語のみ対応している。

def mask_pii_text(text, language_code='en'):
    """
    Detect PII entities in the given text with Amazon Comprehend and mask them.

    Every detected entity span is overwritten with asterisks of the same
    length, so characters outside the detected spans keep their positions.

    :param text: The input text containing potential PII.
    :param language_code: The language code of the input text (default is 'en' for English).
    :return: The text with PII entities masked. The original, unmasked text is
        returned when the text is empty, when no entity is detected, or when
        the Comprehend call fails, so callers must not assume the result is
        free of PII.
    """
    if not text:
        # Nothing to mask; skip the remote call entirely.
        return text

    # Initialize the Comprehend client
    comprehend = boto3.client('comprehend')

    try:
        # Detect PII entities in the text
        response = comprehend.detect_pii_entities(Text=text, LanguageCode=language_code)
    except (BotoCoreError, ClientError) as error:
        print(f"Error detecting PII entities: {error}")
        return text  # Best effort: fall back to the original text on error

    pii_entities = response.get('Entities', [])

    if not pii_entities:
        print("No PII entities detected.")
        return text  # No PII detected; return the original text

    # Each span is replaced by a mask of identical length, so offsets reported
    # for the original text stay valid throughout and neither sorting nor
    # offset shifting is required. Editing a character list avoids rebuilding
    # the whole string once per entity.
    chars = list(text)
    for entity in pii_entities:
        begin = entity['BeginOffset']
        end = entity['EndOffset']
        chars[begin:end] = '*' * (end - begin)

    return ''.join(chars)

mima_ita

固有表現抽出でPERSONとOTHERをマスクすることで近いことはできる。
この方法は日本語にも対応している。ただし、OTHERはメールアドレスや電話番号以外にも反応する。たとえば「肺がん」はOTHERとして抽出される。

また、「人名センター長」のような表現では、人名ではなく「人名センター」という組織名として抽出されるケースがある。

def mask_named_entities(text, language_code='ja'):
    """
    Mask PERSON and OTHER named entities found by Amazon Comprehend.

    Entities of any other type are left untouched. Each masked span is
    replaced by asterisks of the same length as the span.

    :param text: The input text containing potential named entities.
    :param language_code: The language code of the input text (default is 'ja' for Japanese).
    :return: The text with PERSON/OTHER entities masked, or the original text
        when detection fails or reports no entities.
    """
    client = boto3.client('comprehend')

    try:
        result = client.detect_entities(Text=text, LanguageCode=language_code)
    except (BotoCoreError, ClientError) as error:
        # Best effort: report the failure and hand back the input unchanged
        print(f"Error detecting named entities: {error}")
        return text

    found = result.get('Entities', [])
    if not found:
        print("No named entities detected.")
        return text

    # Walk the entities from left to right
    ordered = sorted(found, key=lambda item: item['BeginOffset'])

    output = text
    for item in ordered:
        # Only PERSON and OTHER entities are masked
        if item['Type'] not in ['PERSON', 'OTHER']:
            continue

        start, stop = item['BeginOffset'], item['EndOffset']

        # The mask has the same length as the span it replaces, so offsets of
        # the remaining entities need no adjustment.
        output = output[:start] + '*' * (stop - start) + output[stop:]

    return output

mima_ita

Google Cloud DLPでも個人情報のマスキングが可能。

価格：

# pip install google-cloud-dlp
from google.cloud import dlp_v2

def deidentify_with_masking(project, text_to_deidentify):
    """
    Mask person names and email addresses in a text via Cloud DLP and print the result.

    :param project: Google Cloud project ID used to build the request parent.
    :param text_to_deidentify: The text to inspect and mask.
    :return: None. The masked text is printed to standard output.
    """
    # Create the DLP client
    client = dlp_v2.DlpServiceClient()

    # Masking rule: replace each character with '*'; number_to_mask of 0 means
    # the number of characters to mask is not limited.
    char_mask = {"masking_character": "*", "number_to_mask": 0}
    masking_rule = {
        "primitive_transformation": {"character_mask_config": char_mask},
    }

    # Assemble the DLP request
    request = {
        "parent": f"projects/{project}",
        "deidentify_config": {
            "info_type_transformations": {"transformations": [masking_rule]},
        },
        # Info types to look for in the text
        "inspect_config": {
            "info_types": [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}],
        },
        "item": {"value": text_to_deidentify},
    }
    result = client.deidentify_content(request=request)

    # Show the masked result
    print(f"Deidentified Content: {result.item.value}")

# Specify the Google Cloud project ID
project_id = "mytestproject-422808"

# Japanese sample text to be masked
text = "田中太郎のメールアドレスはtest@so-net.ne.jpです。"

# Run the masking through DLP
deidentify_with_masking(project_id, text)
# Observed result: the email address was NOT masked
# Deidentified Content: *****メールアドレスはtest@so-net.ne.jpです。