📮
郵便番号辞書 Mozc形式作成手順
日本郵便の郵便番号データをMozcの辞書形式に変換して出力する手順とスクリプトです。
スクリプトはMITライセンスです。
phoepsilonix/japanese-zip-code-dictionary
# Fetch the Japan Post postal-code archives.
# -nc: do not re-download a file that already exists locally.
wget -nc https://www.post.japanpost.jp/zipcode/dl/kogaki/zip/ken_all.zip
wget -nc https://www.post.japanpost.jp/zipcode/dl/jigyosyo/zip/jigyosyo.zip
# -o: overwrite previously extracted CSVs without prompting.
unzip -o ken_all.zip
unzip -o jigyosyo.zip

# ICU transliteration rules applied while converting cp932 -> UTF-8:
#  * Fullwidth-Halfwidth is applied to every character that is NOT Katakana,
#    Hiragana or Han and NOT one of  ー ・ 「 」 、 , ( )
#    The last three are the FULL-WIDTH comma and parentheses: they must stay
#    full-width so that a comma inside a name is not turned into an ASCII ","
#    (the CSV field separator) and the parentheses survive for later matching.
#  * Letter-numbers (\p{Nl}) are passed through Latin-ASCII.
translit_rules='::[ [:^Katakana:] & [:^Hiragana:] & [:^Han:] & [^ー・「」、,()]] Fullwidth-Halfwidth; ::[\p{Nl}] Latin-ASCII;'

uconv -x "$translit_rules" -f cp932 -t UTF-8 KEN_ALL.CSV > KEN_ALL_UTF8.CSV
uconv -x "$translit_rules" -f cp932 -t UTF-8 JIGYOSYO.CSV > JIGYOSYO_UTF8.CSV
ken_all-convert-mozc-dictionary.awk
# Converts KEN_ALL_UTF8.CSV records into tab-separated dictionary lines:
#
#     NNN-NNNN <TAB> $7 $8 <town> <TAB> 地名 <TAB>
#
# where NNN-NNNN is built from field 3 and <town> is derived from field 9.
# (Fields 7/8/9 are presumably prefecture / municipality / town area --
# verify against the Japan Post KEN_ALL column description.)
#
# Requires gawk: gensub() is a gawk extension.
# The parentheses matched below are the FULL-WIDTH characters ( and the
# digit class accepts both ASCII and full-width digits.
BEGIN {
    FS = ","
    f = 0        # field-3 value whose following records are skipped; 0 = none
}

# Stage 1: clean the record and derive the town name.
{
    # Strip the CSV quoting; assigning to $0 makes awk re-split the fields.
    gsub("\"", "", $0)

    # Drop catch-all descriptions that are not real place names.
    gsub(/以下に掲載がない場合|.*くる場合|.*村一円/, "", $9)

    # Skip records that share field 3 with a record flagged below.
    if (f == $3) {
        next
    } else {
        f = 0
    }

    # A full-width "(" followed by "、" or "・" flags this record: later
    # records with the same field 3 are skipped by the check above.
    if (($9 ~ /(.+、/ || $9 ~ /(.*・/) && f == 0) f = $3

    # Keep only the text in front of the first full-width "(".
    $9 = gensub(/([^(]*)(.*/, "\\1", "G", $9)

    # NOTE(review): the pattern has only two groups, so "\\3" appears to
    # expand to nothing in gawk, i.e. "第N地割" and everything after it is
    # removed -- confirm this is the intended result.
    town = gensub(/第?([0-90-9]+)地割(.*)/, "\\3", "G", $9)
}

# Stage 2: print each distinct (field 3, field 7, field 8, town) once.
{
    if (!a[$3, $7, $8, town]++) {
        zip = substr($3, 1, 3) "-" substr($3, 4, 4)
        if (town ~ /、/) {
            # Several names listed with "、": emit one line per name.
            if (town ~ /町/) {
                # Text up to the first "町" is a prefix shared by every name
                # taken from the part that follows it.
                split(town, chou, "町")
                n = split(chou[2], names, "、")
                prefix = chou[1] "町"
            } else {
                n = split(town, names, "、")
                prefix = ""
            }
            # Indexed loop instead of for-in: for-in order is unspecified,
            # this keeps the output in the order the names were listed.
            for (i = 1; i <= n; i++) {
                print zip "\t" $7 $8 prefix names[i] "\t" "地名" "\t"
            }
        } else {
            print zip "\t" $7 $8 town "\t" "地名" "\t"
        }
    }
}
jigyosyo-convert-mozc-dictionary.awk
# Converts JIGYOSYO_UTF8.CSV records into tab-separated dictionary lines:
#
#     NNN-NNNN <TAB> $4 $5 $6 <SPACE> $3 <TAB> 組織 <TAB>
#
# where NNN-NNNN is built from field 8 and field 3 is the name, with
# full-width comma/parentheses and "㈱" rewritten to ASCII forms.
# The characters matched below are FULL-WIDTH ( ) and , .
BEGIN {
    FS = ","
    f = 0        # field-8 value whose following records are skipped; 0 = none
}

# Stage 1: clean the record and decide whether it is skipped.
{
    # Strip the CSV quoting; assigning to $0 makes awk re-split the fields.
    gsub("\"", "", $0)

    # NOTE(review): $9 is not used anywhere else in this script; this line
    # looks carried over from the KEN_ALL script -- confirm it is needed.
    gsub(/以下に掲載がない場合|.*くる場合|.*村一円/, "", $9)

    # Skip records that share field 8 with a record flagged below.
    if (f == $8) {
        next
    } else {
        f = 0
    }

    # A full-width "(" followed by "、" or "・" in field 7 flags this record:
    # later records with the same field 8 are skipped by the check above.
    if (($7 ~ /(.+、/ || $7 ~ /(.*・/) && f == 0) f = $8
}

# Stage 2: normalise the name and print each distinct entry once.
{
    # Field splitting is already done, so full-width punctuation in the name
    # can now be folded to ASCII without creating extra CSV fields.
    gsub(",", ",", $3)
    gsub("(", "(", $3)
    gsub(")", ")", $3)
    gsub("㈱", "(株)", $3)

    if (!a[$8, $4, $5, $6, $3]++) {
        print substr($8, 1, 3) "-" substr($8, 4, 4) "\t" $4 $5 $6 " " $3 "\t" "組織" "\t"
    }
}
# Run the conversions with gawk explicitly: the KEN_ALL script calls gensub(),
# a gawk extension that is missing when "awk" is mawk or BWK awk.
gawk -f ken_all-convert-mozc-dictionary.awk KEN_ALL_UTF8.CSV > KEN_ALL.txt
gawk -f jigyosyo-convert-mozc-dictionary.awk JIGYOSYO_UTF8.CSV > JIGYOSYO.txt
Discussion