😽

いろいろなプログラミング言語で NFD の濁音・半濁音を NFC に変換する

2024/07/16に公開

NFD で分解された濁音・半濁音つきの仮名を NFC の合成済み文字に変換するコードを、いろいろな言語で書いて1つの記事にまとめることにした。

JavaScript

test.js
// Sample text: kana followed by combining dakuten (U+3099) / handakuten (U+309A).
// Declared with const so the script does not create an implicit global
// (an undeclared assignment throws in strict mode and in ES modules).
const str = "は\u{3099}は\u{3099}とハ\u{309A}ハ\u{309A}と神";

console.log(
    // 11 UTF-16 code units before normalization, 7 after.
    11 === str.length,
    7 === dakuon_normalize(str).length,
    // Length alone cannot tell "ば" from "は", so also compare the content.
    "ばばとパパと神" === dakuon_normalize(str)
);

/**
 * Recompose kana that are followed by a combining mark into NFC form.
 *
 * Only sequences of one Hiragana/Katakana character plus one nonspacing mark
 * are normalized; text that does not match the pattern is left as it is.
 *
 * @param {string} str - Text that may contain decomposed dakuon / handakuon.
 * @returns {string} Copy of str with each matched sequence NFC-normalized.
 */
function dakuon_normalize(str) {
    const pattern = /[\p{sc=Hiragana}\p{sc=Katakana}]\p{gc=Mn}/gu;
    // The replacer's first argument is the matched substring itself (not an
    // array), so normalize the whole match. Indexing it with [0] would keep
    // only the base kana and silently drop the combining mark.
    return str.replace(pattern, (match) => match.normalize('NFC'));
}

Ruby

test.rb
# Recompose kana that are followed by a combining mark into NFC form.
#
# Each Hiragana/Katakana character immediately followed by a nonspacing mark
# is replaced by its NFC-normalized form; everything else is kept as is.
#
# str - String that may contain decomposed dakuon / handakuon.
#
# Returns a new String.
def dakuon_normalize(str)
  str.gsub(/[\p{Hiragana}\p{Katakana}]\p{Mn}/) do |kana_with_mark|
    kana_with_mark.unicode_normalize(:nfc)
  end
end

# Sample text: kana followed by combining dakuten (U+3099) / handakuten (U+309A).
decomposed = "は\u{3099}は\u{3099}とハ\u{309A}ハ\u{309A}と神"
# Print the normalized string in inspect form.
p(dakuon_normalize(decomposed))

Python

test.py
import unicodedata
import regex


def dakuon_normalize(str):
    """Recompose kana that are followed by a combining mark into NFC form.

    Each Hiragana/Katakana character immediately followed by a nonspacing
    mark is replaced by its NFC-normalized form; other text is kept as is.

    Args:
        str: Text that may contain decomposed dakuon / handakuon.

    Returns:
        The text with every matched sequence NFC-normalized.
    """
    def _compose(match):
        # Normalize only the matched kana + mark sequence.
        return unicodedata.normalize('NFC', match.group(0))

    # The third-party `regex` module is used because the pattern relies on
    # \p{...} Unicode property classes.
    return regex.sub(r'[\p{Hiragana}\p{Katakana}]\p{Mn}', _compose, str)


# Sample text: kana followed by combining dakuten (U+3099) / handakuten (U+309A).
# Named `text` so the module-level variable does not shadow the builtin `str`.
text = "は\u3099は\u3099とハ\u309Aハ\u309Aと神"

# Length in code points before and after normalization.
print(len(text))
print(len(dakuon_normalize(text)))

PHP

test.php
<?php
// Decomposed input: kana followed by combining dakuten (U+3099) / handakuten (U+309A).
$str = "は\u{3099}は\u{3099}とハ\u{309A}ハ\u{309A}と神";
// Expected result written with precomposed characters.
$ret = 'ばばとパパと神';
// Dump whether the normalized input is identical to the expected string.
var_dump(dakuon_normalize($str) === $ret);

/**
 * Recompose kana that are followed by a combining mark into NFC form.
 *
 * Each Hiragana/Katakana character immediately followed by a nonspacing mark
 * is replaced by its NFC-normalized form; other text is kept as is.
 *
 * @param string $str Text that may contain decomposed dakuon / handakuon.
 * @return string Copy of $str with every matched sequence NFC-normalized.
 */
function dakuon_normalize(string $str): string {
    // PHP variables need the leading "$"; without it this line is a parse error.
    $pattern = '/[\p{Hiragana}\p{Katakana}]\p{Mn}/su';
    return preg_replace_callback(
        $pattern,
        // Request NFC explicitly instead of relying on the default form.
        fn($matches) => normalizer_normalize($matches[0], Normalizer::FORM_C),
        $str);
}

Discussion