Open8
web開発者のための大規模サービス技術入門の課題をGo言語でやってみる
圧縮プログラミング
疑似コードから書き起こし。ある程度大きな整数になると損し始める。とりあえず、整数だけのファイルを作って、圧縮してみるだけの処理なので、課題のcsvはまだ扱えてない。
main.go
package main
import (
"crypto/rand"
"encoding/binary"
"flag"
"fmt"
"io"
"log"
"math/big"
"os"
)
const (
MASK = 1<<7 - 1
VBENCODE_FLAG = 1 << 7
)
func VBEncodeNumber(v uint32) []byte {
bytes := make([]byte, 5)
i := len(bytes) - 1
for {
bytes[i] = byte(int(v) & MASK)
if v < VBENCODE_FLAG {
break
}
v >>= 7
i--
}
bytes[4] |= VBENCODE_FLAG
return bytes[i:]
}
func VBDecode(byteStream []byte) []uint32 {
out := make([]uint32, 0, 1000)
n := 0
for _, b := range byteStream {
if b&VBENCODE_FLAG != 0 {
n = n<<7 + (int(b) - VBENCODE_FLAG)
out = append(out, uint32(n))
n = 0
} else {
n = n<<7 + int(b)
}
}
return out
}
func GenerateInput(n int, dst string) {
w, err := os.Create(dst)
if err != nil {
log.Fatal(err)
}
defer w.Close()
for i := 0; i < n; i++ {
v, err := rand.Int(rand.Reader, big.NewInt(1<<28-1))
if err != nil {
panic(err)
}
fmt.Println(v.Int64())
binary.Write(w, binary.LittleEndian, uint32(v.Int64()))
}
}
func main() {
var src, dst string
var writeFlag bool
var generateNumber int
flag.StringVar(&src, "src", "", "src file path")
flag.StringVar(&dst, "o", "out.bin", "output file path")
flag.BoolVar(&writeFlag, "w", false, "write flag")
flag.IntVar(&generateNumber, "g", 0, "create input")
flag.Parse()
if generateNumber > 0 {
if dst == "" {
dst = "input.txt"
}
GenerateInput(generateNumber, dst)
return
} else {
if _, err := os.Stat(src); err != nil {
flag.Usage()
log.Fatal(fmt.Sprintf("src='%v':", src), err)
}
}
if writeFlag {
r, err := os.Open(src)
if err != nil {
log.Fatal(err)
}
defer r.Close()
w, err := os.Create(dst)
if err != nil {
log.Fatal(err)
}
defer w.Close()
byteStream := make([]byte, 0, 10000)
for {
var vv uint32
err = binary.Read(r, binary.LittleEndian, &vv)
if err == io.EOF {
break
}
if err != nil {
log.Fatal(err)
}
byteStream = append(byteStream, VBEncodeNumber(vv)...)
}
w.Write(byteStream)
} else {
r, err := os.Open(src)
if err != nil {
log.Fatal(err)
}
defer r.Close()
bs, err := io.ReadAll(r)
if err != nil {
log.Fatal(err)
}
numbers := VBDecode(bs)
for _, v := range numbers {
fmt.Println(v)
}
}
}
$ go run . -g 1000 -o 1000.bin #1000個の整数をランダムに生成して1000.binに
$ go run . -w -src 1000.bin -o 1000.out # 圧縮して1000.outに保存
$ go run . -src 1000.out # 解凍して標準出力
前回、コマンドオプションが無法地帯になってたのでspf13/cobraを導入してサブコマンドにした。
.
├── cmd
│ ├── decode.go
│ ├── encode.go
│ ├── generate.go
│ └── root.go
├── go.mod
├── go.sum
├── main.go
└── vbencode
├── vbencode.go
└── vbencode_test.go
$ go run . generate --help
Usage:
ch06 generate [flags]
Flags:
-h, --help help for generate
-n, --number int 生成される個数 (default 10)
-o, --out string output file (default "input.bin")
$ go run . encode --help
Usage:
ch06 encode [flags]
Flags:
-h, --help help for encode
-o, --out string output file (default "out.bin")
go run . decode --help
Usage:
ch06 decode [flags]
Flags:
-h, --help help for decode
一応完成。decodeが遅い。多分、fmt.Print
系使っているのが原因かな。[]byte
をガッと確保して書き込んだ方が早そう。実際のところはよく分からないけれども。あと実はbinary.Read
とか使う必要ないと思う。generate
はちゃんとしようとすると面倒なので、削除。
encode.go
package cmd
import (
"bufio"
"encoding/binary"
"fmt"
"io"
"log"
"os"
"strconv"
"strings"
"example.com/ch06/vbencode"
"github.com/spf13/cobra"
)
func init() {
var dst string
encodeCmd := &cobra.Command{
Use: "encode",
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
src := args[0]
if _, err := os.Stat(src); err != nil {
return fmt.Errorf("src='%v'", src)
}
r, err := os.Open(src)
if err != nil {
log.Fatal(err)
}
defer r.Close()
w, err := os.Create(dst)
if err != nil {
log.Fatal(err)
}
defer w.Close()
reader := bufio.NewReaderSize(r, 8192)
for {
byteStream := make([]byte, 0, 10000)
line := make([]byte, 0, 10000)
var isContinue bool = true
for isContinue {
bs, isPrefix, err := reader.ReadLine()
if err == io.EOF {
goto end
}
line = append(line, bs...)
isContinue = isPrefix
}
a := strings.SplitN(string(line), "\t", 2)
tag := []byte(a[0])
binary.Write(w, binary.LittleEndian, uint32(len(tag)))
var pre int
for _, num := range strings.Split(a[1], ",") {
v, err := strconv.Atoi(num)
if err != nil {
return err
}
byteStream = append(byteStream, vbencode.Encode(uint32(v-pre))...)
pre = v
}
binary.Write(w, binary.LittleEndian, uint32(len(byteStream)))
w.Write(tag)
w.Write(byteStream)
}
end:
return nil
},
}
encodeCmd.Flags().StringVarP(&dst, "out", "o", "out.bin", "output file")
rootCmd.AddCommand(encodeCmd)
}
decode.go
package cmd
import (
"encoding/binary"
"fmt"
"io"
"log"
"os"
"example.com/ch06/vbencode"
"github.com/spf13/cobra"
)
func init() {
encodeCmd := &cobra.Command{
Use: "decode",
Args: cobra.ExactArgs(1),
Run: func(cmd *cobra.Command, args []string) {
r, err := os.Open(args[0])
if err != nil {
log.Fatal(err)
}
defer r.Close()
for {
var tagLen, numLen uint32
err = binary.Read(r, binary.LittleEndian, &tagLen)
if err == io.EOF {
break
}
err = binary.Read(r, binary.LittleEndian, &numLen)
if err == io.EOF {
break
}
tag := make([]byte, tagLen)
_, err = r.Read(tag)
if err == io.EOF {
break
}
fmt.Printf("%s\t", string(tag))
bs := make([]byte, numLen)
_, err = r.Read(bs)
if err == io.EOF {
break
}
numbers := vbencode.Decode(bs)
n := len(numbers) - 1
var pre uint32
for i, v := range numbers {
fmt.Print(v + pre)
if i < n {
fmt.Print(",")
}
pre = v + pre
}
fmt.Println("")
}
},
}
rootCmd.AddCommand(encodeCmd)
}
書籍の情報と違ってデータの容量は172MBで圧縮後が40MBだった。decodeしてdiffとって差分が出なかったので、とりあえず良し。
$ go run . encode ./data/eid_tags.txt -o out.bin
$ ls -lh ./data/eid_tags.txt out.bin
-rw-r--r--@ 1 nabeyang staff 172M 1 22 13:12 ./data/eid_tags.txt
-rw-r--r-- 1 nabeyang staff 40M 1 29 18:03 out.bin
キーワードリンク
ディレクトリ構造はこんな感じ。サブコマンドなんて要らなかった。
├── cmd
│ ├── replace.go
│ └── root.go
├── go.mod
├── go.sum
├── main.go
└── trie
├── trie.go
└── trie_test.go
書籍と違い、入力文字列の中のキーワードを*
で囲む変換をして標準出力するようにしてみた。AC法ではなく、普通にTrie木で。Trie木はここの高速化前の実装を参考に。
replace.go
package cmd
import (
"bufio"
"fmt"
"html"
"io"
"os"
"example.com/ch08/trie"
"github.com/spf13/cobra"
)
func init() {
printCmd := &cobra.Command{
Use: "replace",
Args: cobra.ExactArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
r, err := os.Open(args[0])
if err != nil {
return err
}
defer r.Close()
words := make([]string, 0, 1000)
rr := bufio.NewReaderSize(r, 8192)
var word string
for {
line := make([]byte, 0, 10000)
var isContinue bool = true
for isContinue {
bs, isPrefix, err := rr.ReadLine()
if err == io.EOF {
goto end
}
line = append(line, bs...)
isContinue = isPrefix
}
word = html.UnescapeString(string(line))
words = append(words, word)
}
end:
in := bufio.NewReader(os.Stdin)
var input string
fmt.Fscan(in, &input)
t := trie.NewTrie(words)
fmt.Println(t.ReplaceAll(input, func(word string) string {
return fmt.Sprintf(" *%s* ", word)
}))
return nil
},
}
rootCmd.AddCommand(printCmd)
}
trie.go
package trie
type node struct {
children map[rune]*node
value string
}
func newNode() *node {
return &node{
children: make(map[rune]*node),
}
}
type Trie struct {
root *node
}
func NewTrie(keys []string) *Trie {
t := &Trie{
root: newNode(),
}
for _, key := range keys {
t.add(key)
}
return t
}
func (t *Trie) Match(input string, isGreedly bool) [][2]int {
var results = make([][2]int, 0, 100)
var r [][2]int
chars := []rune(input)
e := len(chars)
b := 0
for b < e {
r = t.search(chars, b, isGreedly)
n := len(r)
if n > 0 {
results = append(results, r...)
b = r[n-1][0] + 1
} else {
b++
}
}
return results
}
func (t *Trie) ReplaceAll(input string, replace func(string) string) string {
chars := []rune(input)
n := len(chars)
dist := make([]rune, 0, n)
results := t.Match(input, true)
if len(results) == 0 {
return input
}
start := 0
for _, r := range results {
pos, len := r[0], r[1]
if pos < start {
continue
}
dist = append(dist, chars[start:pos]...)
dist = append(dist, []rune(replace(string(chars[pos:pos+len])))...)
start = pos + len
}
if start < n {
dist = append(dist, chars[start:]...)
}
return string(dist)
}
func (t *Trie) add(key string) {
n := t.root
for _, ch := range key {
nn, ok := n.children[ch]
if !ok {
nn = newNode()
}
n.children[ch] = nn
n = nn
}
n.value = key
}
// (開始, 長さ)
func (t *Trie) search(chars []rune, start int, isGreedly bool) [][2]int {
end := len(chars)
results := make([][2]int, 0, 100)
for i := start; i < end; i++ {
ts := make([][2]int, 0, 100)
if n, ok := t.root.children[chars[i]]; ok {
if n.value != "" {
ts = append(ts, [2]int{i, 1})
}
for j := i + 1; j < end; j++ {
if n, ok = n.children[chars[j]]; ok {
if n.value != "" {
ts = append(ts, [2]int{i, j - i + 1})
}
} else {
break
}
}
} else {
break
}
n := len(ts)
if isGreedly {
if n > 0 {
results = append(results, ts[n-1])
}
} else {
results = append(results, ts...)
}
}
return results
}
実行
go run . replace /path/to/keyword.utf8.uniq.txt < /path/to/input.txt
テストはこんな感じで。
trie_test.go
package trie
import (
"fmt"
"testing"
)
func TestAdd(t *testing.T) {
tr := NewTrie([]string{"ab", "abcde", "bc", "bab", "d"})
expect := "abcde"
actual := tr.root.
children['a'].children['b'].children['c'].
children['d'].children['e'].value
if actual != expect {
t.Errorf("expected %s, but got %s", expect, actual)
}
expect = "ab"
actual = tr.root.
children['a'].children['b'].value
if actual != expect {
t.Errorf("expected %s, but got %s", expect, actual)
}
expect = "bc"
actual = tr.root.
children['b'].children['c'].value
if actual != expect {
t.Errorf("expected %s, but got %s", expect, actual)
}
expect = "bab"
actual = tr.root.
children['b'].children['a'].children['b'].value
if actual != expect {
t.Errorf("expected %s, but got %s", expect, actual)
}
expect = "d"
actual = tr.root.
children['d'].value
if actual != expect {
t.Errorf("expected %s, but got %s", expect, actual)
}
}
func TestSearch(t *testing.T) {
tr := NewTrie([]string{"a", "he", "hers", "his", "she"})
input := "a his hoge hershe"
results := tr.Match(input, false)
expects := [][2]int{
{0, 1},
{2, 3},
{11, 2},
{11, 4},
{14, 3},
{15, 2},
}
if len(results) != len(expects) {
t.Fatalf("len(results) equals len(expects) but %d != %d", len(results), len(expects))
}
chars := []rune(input)
for i, expect := range expects {
r := results[i]
j, k := r[0], r[1]
l, m := expect[0], expect[1]
if l != j || m != k {
t.Errorf("expected %s:(%d, %d), but got %s:(%d, %d)", string(chars[l:l+m]), l, m, string(chars[j:j+k]), j, k)
}
}
}
func TestSearch2(t *testing.T) {
tr := NewTrie([]string{"a", "he", "hers", "his", "she"})
input := "a his hoge hershe"
results := tr.Match(input, true)
expects := [][2]int{
{0, 1},
{2, 3},
{11, 4},
{14, 3},
{15, 2},
}
if len(results) != len(expects) {
t.Fatalf("len(results) equals len(expects) but %d != %d", len(results), len(expects))
}
chars := []rune(input)
for i, expect := range expects {
r := results[i]
j, k := r[0], r[1]
l, m := expect[0], expect[1]
if l != j || m != k {
t.Errorf("expected %s:(%d, %d), but got %s:(%d, %d)", string(chars[l:l+m]), l, m, string(chars[j:j+k]), j, k)
}
}
}
func TestReplaceAll(t *testing.T) {
tr := NewTrie([]string{"世界"})
input := "こんにちは、世界"
actual := tr.ReplaceAll(input, func(word string) string {
return fmt.Sprintf("*%s*", word)
})
expect := "こんにちは、*世界*"
if actual != expect {
t.Errorf("expected %s, but got %s", expect, actual)
}
}
func TestReplaceAll2(t *testing.T) {
tr := NewTrie([]string{"he", "hers", "his", "she"})
input := "a his hoge hershe"
actual := tr.ReplaceAll(input, func(word string) string {
return fmt.Sprintf("*%s*", word)
})
expect := "a *his* hoge *hers**he*"
if actual != expect {
t.Errorf("expected %s, but got %s", expect, actual)
}
}