📮

郵便番号辞書 Mozc形式作成手順

2024/03/30に公開

日本郵便が公開している郵便番号データ（KEN_ALL.CSV／JIGYOSYO.CSV）を、Mozcの辞書形式に変換して出力する手順とスクリプトです。実行には wget、unzip、ICU の uconv、gawk が必要です。
スクリプトはMITライセンスです。
phoepsilonix/japanese-zip-code-dictionary

# Fetch the postal-code archives published by Japan Post.
# -nc: do not re-download a file that already exists locally.
wget -nc https://www.post.japanpost.jp/zipcode/dl/kogaki/zip/ken_all.zip
wget -nc https://www.post.japanpost.jp/zipcode/dl/jigyosyo/zip/jigyosyo.zip
# -o: overwrite previously extracted CSV files without prompting.
unzip -o ken_all.zip
unzip -o jigyosyo.zip
# Transcode CP932 -> UTF-8 and fold full-width characters to half-width.
# Kana, kanji and the punctuation ー・「」、，（） are excluded from the folding:
# the full-width comma and parentheses must stay full-width so that they never
# turn into an ASCII "," inside a CSV field, and so that the awk scripts can
# match them. Letter-number characters (\p{Nl}, e.g. Roman numerals) are
# rewritten to ASCII.
uconv -x '::[ [:^Katakana:] & [:^Hiragana:] & [:^Han:] & [^ー・「」、，（）]]  Fullwidth-Halfwidth; ::[\p{Nl}] Latin-ASCII;' -f cp932 -t UTF-8 KEN_ALL.CSV > KEN_ALL_UTF8.CSV
uconv -x '::[ [:^Katakana:] & [:^Hiragana:] & [:^Han:] & [^ー・「」、，（）]]  Fullwidth-Halfwidth; ::[\p{Nl}] Latin-ASCII;' -f cp932 -t UTF-8 JIGYOSYO.CSV > JIGYOSYO_UTF8.CSV
ken_all-convert-mozc-dictionary.awk
# Convert the UTF-8 KEN_ALL CSV into Mozc dictionary lines of the form
#   NNN-NNNN <TAB> <$7><$8><town name> <TAB> 地名 <TAB>
# Requires gawk (gensub is a gawk extension) and a UTF-8 locale so that the
# multi-byte characters in the regular expressions match correctly.
# Fields used: $3 = 7-digit postal code; $7, $8, $9 = prefecture, city and
# town-area names (per Japan Post's KEN_ALL layout).
BEGIN {
    FS = ","
    # Postal code whose following records must be skipped, or 0 when none.
    f = 0
}

# Pass 1 for each record: normalise the town-area name into c[0].
{
    # Strip the CSV double quotes; assigning $0 re-splits the fields.
    gsub("\"", "", $0)
    # Drop catch-all phrases that are not real place names.
    gsub(/以下に掲載がない場合|.*くる場合|.*村一円/, "", $9)

    # Skip records that share the postal code remembered in f.
    if (f == $3) {
        next
    } else {
        f = 0
    }

    # A full-width "（" followed by a list ("、" or "・") marks a parenthesised
    # note; remember this postal code so records that follow with the same
    # code are dropped.
    if (($9 ~ /（.+、/ || $9 ~ /（.*・/) && f == 0) f = $3

    # Keep only the text before the first full-width opening parenthesis.
    $9 = gensub(/([^（]*)（.*/, "\\1", "G", $9)
    # Remove "第N地割" (land-lot numbering) and everything after it.
    c[0] = gensub(/第?([0-9０-９]+)地割(.*)/, "", "G", $9)
}

# Pass 2 for each record: emit one dictionary line per distinct place name.
{
    # a[] de-duplicates on (postal code, prefecture, city, town name).
    if (!a[$3, $7, $8, c[0]]++) {
        zip = substr($3, 1, 3) "-" substr($3, 4, 4)
        if (c[0] ~ /、/) {
            # Several names joined by "、": print one line per name.
            if (c[0] ~ /町/) {
                # "X町A、B" -> common prefix "X町" plus the names A and B.
                split(c[0], chou, "町")
                n = split(chou[2], array, "、")
                chou[1] = chou[1] "町"
            } else {
                n = split(c[0], array, "、")
                chou[1] = ""
            }
            # Indexed loop keeps the names in their original order
            # (for-in order is unspecified in awk).
            for (x = 1; x <= n; x++) {
                print zip "\t" $7 $8 chou[1] array[x] "\t" "地名" "\t"
            }
        } else {
            print zip "\t" $7 $8 c[0] "\t" "地名" "\t"
        }
    }
}
jigyosyo-convert-mozc-dictionary.awk
# Convert the UTF-8 JIGYOSYO CSV (business-office postal codes) into Mozc
# dictionary lines of the form
#   NNN-NNNN <TAB> <$4><$5><$6> <organisation name> <TAB> 組織 <TAB>
# Requires a UTF-8 locale so the multi-byte characters below match correctly.
# Fields used: $3 = organisation name, $4/$5/$6 = prefecture/city/town,
# $7 = remaining address detail, $8 = 7-digit postal code (per Japan Post's
# JIGYOSYO layout).
BEGIN {
    FS = ","
    # Postal code whose following records must be skipped, or 0 when none.
    f = 0
}

# Pass 1 for each record: clean up and decide whether to skip it.
{
    # Strip the CSV double quotes; assigning $0 re-splits the fields.
    gsub("\"", "", $0)
    # NOTE(review): this cleanup mirrors the KEN_ALL script; $9 is presumably
    # not a place name in this file, so it likely never matches - confirm.
    gsub(/以下に掲載がない場合|.*くる場合|.*村一円/, "", $9)

    # Skip records that share the postal code remembered in f.
    if (f == $8) {
        next
    } else {
        f = 0
    }

    # A full-width "（" followed by a list ("、" or "・") in the address detail
    # marks a parenthesised note; remember this postal code.
    if (($7 ~ /（.+、/ || $7 ~ /（.*・/) && f == 0) f = $8
}

# Pass 2 for each record: normalise the organisation name and print it.
{
    # These characters were kept full-width during transcoding so that they
    # could not break CSV parsing; fold them to ASCII now that the fields
    # are split.
    gsub("，", ",", $3)
    gsub("（", "(", $3)
    gsub("）", ")", $3)
    gsub("㈱", "(株)", $3)

    # a[] de-duplicates on (postal code, prefecture, city, town, name).
    if (!a[$8, $4, $5, $6, $3]++) {
        print substr($8, 1, 3) "-" substr($8, 4, 4) "\t" $4 $5 $6 " " $3 "\t" "組織" "\t"
    }
}
# The KEN_ALL script uses gensub(), a GNU awk extension, so invoke gawk
# explicitly (plain "awk" may be mawk or BusyBox awk). Run under a UTF-8 locale.
gawk -f ken_all-convert-mozc-dictionary.awk KEN_ALL_UTF8.CSV > KEN_ALL.txt
gawk -f jigyosyo-convert-mozc-dictionary.awk JIGYOSYO_UTF8.CSV > JIGYOSYO.txt

Discussion