Open3
個人情報マスク
Amazon ComprehendのPII検出機能で個人情報のマスキングが可能。
ただし、現時点で対応言語は英語とスペイン語のみ。
def mask_pii_text(text, language_code='en'):
    """
    Detect PII entities in ``text`` with Amazon Comprehend and mask them.

    Every character of each detected PII span is replaced with ``*``, so the
    returned string keeps the same length as the input.

    :param text: The input text containing potential PII.
    :param language_code: The language code of the input text (default is
        'en' for English).
    :return: The text with PII entities masked. The original text is returned
        unchanged when it is empty, when no PII entity is detected, or when
        the Comprehend call fails (the error is printed, not raised).
    """
    if not text:
        # Nothing to mask; skip the service round trip entirely.
        return text

    # Initialize the Comprehend client
    comprehend = boto3.client('comprehend')
    try:
        # Detect PII entities in the text
        response = comprehend.detect_pii_entities(Text=text, LanguageCode=language_code)
    except (BotoCoreError, ClientError) as error:
        print(f"Error detecting PII entities: {error}")
        # NOTE(review): this fails open -- the caller receives the UNMASKED
        # text. Confirm that is acceptable wherever the result is stored/shown.
        return text

    pii_entities = response.get('Entities', [])
    if not pii_entities:
        print("No PII entities detected.")
        return text

    # Each mask has exactly the length of the span it replaces, so offsets
    # never shift and the entities can be applied in any order.
    masked_text = text
    for entity in pii_entities:
        begin = entity['BeginOffset']
        end = entity['EndOffset']
        masked_text = masked_text[:begin] + '*' * (end - begin) + masked_text[end:]
    return masked_text
固有表現抽出でPERSONとOTHERをマスクすることで近いことはできる。
この方法は日本語でも対応可能。しかし、OTHERはメールアドレスや電話番号以外でも引っかかる。たとえば「肺がん」はOTHERとして抽出される。
また、「人名センター長」のような表現では、人名ではなく「人名センター」という組織名として抽出されるケースがある。
def mask_named_entities(text, language_code='ja', entity_types=('PERSON', 'OTHER')):
    """
    Detect named entities in ``text`` with Amazon Comprehend and mask them.

    Only entities whose ``Type`` is listed in ``entity_types`` are masked.
    Every character of a masked span is replaced with ``*``, so the returned
    string keeps the same length as the input.

    :param text: The input text containing potential named entities.
    :param language_code: The language code of the input text (default is
        'ja' for Japanese).
    :param entity_types: Collection of Comprehend entity type names to mask
        (default is PERSON and OTHER). OTHER can match much more than e-mail
        addresses or phone numbers, so pass a narrower collection such as
        ``('PERSON',)`` to reduce over-masking.
    :return: The text with the selected entities masked. The original text is
        returned unchanged when it is empty, when no entity is detected, or
        when the Comprehend call fails (the error is printed, not raised).
    """
    if not text:
        # Nothing to mask; skip the service round trip entirely.
        return text

    # Initialize the Comprehend client
    comprehend = boto3.client('comprehend')
    try:
        # Detect named entities in the text
        response = comprehend.detect_entities(Text=text, LanguageCode=language_code)
    except (BotoCoreError, ClientError) as error:
        print(f"Error detecting named entities: {error}")
        # NOTE(review): this fails open -- the caller receives the UNMASKED
        # text. Confirm that is acceptable wherever the result is stored/shown.
        return text

    entities = response.get('Entities', [])
    if not entities:
        print("No named entities detected.")
        return text

    # Each mask has exactly the length of the span it replaces, so offsets
    # never shift and the entities can be applied in any order.
    masked_text = text
    for entity in entities:
        if entity['Type'] not in entity_types:
            continue
        begin = entity['BeginOffset']
        end = entity['EndOffset']
        masked_text = masked_text[:begin] + '*' * (end - begin) + masked_text[end:]
    return masked_text
Google Cloud DLPでも個人情報のマスキングが可能。
価格:
# pip install google-cloud-dlp
from google.cloud import dlp_v2
def deidentify_with_masking(project, text_to_deidentify,
                            info_types=("PERSON_NAME", "EMAIL_ADDRESS"),
                            masking_character="*"):
    """
    Mask sensitive spans of a text with Google Cloud DLP and print the result.

    :param project: Google Cloud project ID used to build the request parent.
    :param text_to_deidentify: The text to inspect and mask.
    :param info_types: Names of the DLP infoTypes to look for (default is
        PERSON_NAME and EMAIL_ADDRESS).
    :param masking_character: Character that replaces each masked character
        (default is '*').
    :return: The de-identified text taken from the DLP response. It is also
        printed, as before.
    """
    # Create the DLP client
    dlp = dlp_v2.DlpServiceClient()

    # Parent resource the request is issued against
    parent = f"projects/{project}"

    # Inspection settings: which infoTypes should be detected
    inspect_config = {
        "info_types": [{"name": name} for name in info_types],
    }

    # Masking settings: replace every character of a finding with the mask
    # character; number_to_mask=0 puts no limit on how many are masked.
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "character_mask_config": {
                            "masking_character": masking_character,
                            "number_to_mask": 0,
                        }
                    }
                }
            ]
        }
    }

    # Build and send the DLP request
    item = {"value": text_to_deidentify}
    response = dlp.deidentify_content(
        request={
            "parent": parent,
            "deidentify_config": deidentify_config,
            "inspect_config": inspect_config,
            "item": item,
        }
    )

    # Show the masked result and hand it back to the caller
    masked_value = response.item.value
    print("Deidentified Content: {}".format(masked_value))
    return masked_value
# Specify the Google Cloud project ID
project_id = "mytestproject-422808"
# Japanese sample text to be masked (contains a person name and an email address)
text = "田中太郎のメールアドレスはtest@so-net.ne.jpです。"
# Mask the text using DLP
deidentify_with_masking(project_id, text)
# Observed result: the email address was NOT masked
# Deidentified Content: *****メールアドレスはtest@so-net.ne.jpです。