Open8

web開発者のための大規模サービス技術入門の課題をGo言語でやってみる

nabeyangnabeyang

圧縮プログラミング

疑似コードから書き起こし。ある程度大きな整数になると損し始める。とりあえず、整数だけのファイルを作って、圧縮してみるだけの処理なので、課題のcsvはまだ扱えてない。

main.go
package main

import (
	"crypto/rand"
	"encoding/binary"
	"flag"
	"fmt"
	"io"
	"log"
	"math/big"
	"os"
)

const (
	MASK          = 1<<7 - 1
	VBENCODE_FLAG = 1 << 7
)

func VBEncodeNumber(v uint32) []byte {
	bytes := make([]byte, 5)
	i := len(bytes) - 1
	for {
		bytes[i] = byte(int(v) & MASK)
		if v < VBENCODE_FLAG {
			break
		}
		v >>= 7
		i--
	}
	bytes[4] |= VBENCODE_FLAG
	return bytes[i:]
}

func VBDecode(byteStream []byte) []uint32 {
	out := make([]uint32, 0, 1000)
	n := 0
	for _, b := range byteStream {
		if b&VBENCODE_FLAG != 0 {
			n = n<<7 + (int(b) - VBENCODE_FLAG)
			out = append(out, uint32(n))
			n = 0
		} else {
			n = n<<7 + int(b)
		}
	}
	return out
}

func GenerateInput(n int, dst string) {
	w, err := os.Create(dst)
	if err != nil {
		log.Fatal(err)
	}
	defer w.Close()
	for i := 0; i < n; i++ {
		v, err := rand.Int(rand.Reader, big.NewInt(1<<28-1))
		if err != nil {
			panic(err)
		}
		fmt.Println(v.Int64())
		binary.Write(w, binary.LittleEndian, uint32(v.Int64()))

	}

}

func main() {
	var src, dst string
	var writeFlag bool
	var generateNumber int
	flag.StringVar(&src, "src", "", "src file path")
	flag.StringVar(&dst, "o", "out.bin", "output file path")
	flag.BoolVar(&writeFlag, "w", false, "write flag")
	flag.IntVar(&generateNumber, "g", 0, "create input")
	flag.Parse()
	if generateNumber > 0 {
		if dst == "" {
			dst = "input.txt"
		}
		GenerateInput(generateNumber, dst)
		return
	} else {
		if _, err := os.Stat(src); err != nil {
			flag.Usage()
			log.Fatal(fmt.Sprintf("src='%v':", src), err)
		}
	}
	if writeFlag {
		r, err := os.Open(src)
		if err != nil {
			log.Fatal(err)
		}
		defer r.Close()
		w, err := os.Create(dst)
		if err != nil {
			log.Fatal(err)
		}
		defer w.Close()
		byteStream := make([]byte, 0, 10000)
		for {
			var vv uint32
			err = binary.Read(r, binary.LittleEndian, &vv)
			if err == io.EOF {
				break
			}
			if err != nil {
				log.Fatal(err)
			}
			byteStream = append(byteStream, VBEncodeNumber(vv)...)
		}
		w.Write(byteStream)
	} else {
		r, err := os.Open(src)
		if err != nil {
			log.Fatal(err)
		}
		defer r.Close()
		bs, err := io.ReadAll(r)
		if err != nil {
			log.Fatal(err)
		}
		numbers := VBDecode(bs)
		for _, v := range numbers {
			fmt.Println(v)
		}
	}
}
nabeyangnabeyang
$ go run . -g 1000 -o 1000.bin #1000個の整数をランダムに生成して1000.binに
$ go run . -w -src 1000.bin -o 1000.out # 圧縮して1000.outに保存
$ go run . -src 1000.out # 解凍して標準出力
nabeyangnabeyang

前回、コマンドオプションが無法地帯になってたのでspf13/cobraを導入してサブコマンドにした。

.
├── cmd
│   ├── decode.go
│   ├── encode.go
│   ├── generate.go
│   └── root.go
├── go.mod
├── go.sum
├── main.go
└── vbencode
    ├── vbencode.go
    └── vbencode_test.go
$ go run . generate --help
Usage:
  ch06 generate [flags]

Flags:
  -h, --help         help for generate
  -n, --number int   生成される個数 (default 10)
  -o, --out string   output file (default "input.bin")
$ go run . encode --help  
Usage:
  ch06 encode [flags]

Flags:
  -h, --help         help for encode
  -o, --out string   output file (default "out.bin")
go run . decode --help
Usage:
  ch06 decode [flags]

Flags:
  -h, --help   help for decode
nabeyangnabeyang

一応完成。decodeが遅い。多分、fmt.Print系使っているのが原因かな。[]byteをガッと確保して書き込んだ方が早そう。実際のところはよく分からないけれども。あと実はbinary.Readとか使う必要ないと思う。generateはちゃんとしようとすると面倒なので、削除。

encode.go
package cmd

import (
	"bufio"
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"os"
	"strconv"
	"strings"

	"example.com/ch06/vbencode"
	"github.com/spf13/cobra"
)

func init() {
	var dst string
	encodeCmd := &cobra.Command{
		Use:  "encode",
		Args: cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			src := args[0]
			if _, err := os.Stat(src); err != nil {
				return fmt.Errorf("src='%v'", src)
			}
			r, err := os.Open(src)
			if err != nil {
				log.Fatal(err)
			}
			defer r.Close()
			w, err := os.Create(dst)
			if err != nil {
				log.Fatal(err)
			}
			defer w.Close()

			reader := bufio.NewReaderSize(r, 8192)
			for {
				byteStream := make([]byte, 0, 10000)
				line := make([]byte, 0, 10000)
				var isContinue bool = true
				for isContinue {
					bs, isPrefix, err := reader.ReadLine()
					if err == io.EOF {
						goto end
					}
					line = append(line, bs...)
					isContinue = isPrefix
				}
				a := strings.SplitN(string(line), "\t", 2)
				tag := []byte(a[0])
				binary.Write(w, binary.LittleEndian, uint32(len(tag)))
				var pre int
				for _, num := range strings.Split(a[1], ",") {
					v, err := strconv.Atoi(num)
					if err != nil {
						return err
					}
					byteStream = append(byteStream, vbencode.Encode(uint32(v-pre))...)
					pre = v
				}
				binary.Write(w, binary.LittleEndian, uint32(len(byteStream)))
				w.Write(tag)
				w.Write(byteStream)
			}
		end:
			return nil
		},
	}
	encodeCmd.Flags().StringVarP(&dst, "out", "o", "out.bin", "output file")
	rootCmd.AddCommand(encodeCmd)
}
decode.go
package cmd

import (
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"os"

	"example.com/ch06/vbencode"
	"github.com/spf13/cobra"
)

func init() {
	encodeCmd := &cobra.Command{
		Use:  "decode",
		Args: cobra.ExactArgs(1),
		Run: func(cmd *cobra.Command, args []string) {
			r, err := os.Open(args[0])
			if err != nil {
				log.Fatal(err)
			}
			defer r.Close()
			for {
				var tagLen, numLen uint32
				err = binary.Read(r, binary.LittleEndian, &tagLen)
				if err == io.EOF {
					break
				}
				err = binary.Read(r, binary.LittleEndian, &numLen)
				if err == io.EOF {
					break
				}
				tag := make([]byte, tagLen)
				_, err = r.Read(tag)
				if err == io.EOF {
					break
				}
				fmt.Printf("%s\t", string(tag))
				bs := make([]byte, numLen)
				_, err = r.Read(bs)
				if err == io.EOF {
					break
				}
				numbers := vbencode.Decode(bs)
				n := len(numbers) - 1
				var pre uint32
				for i, v := range numbers {
					fmt.Print(v + pre)
					if i < n {
						fmt.Print(",")
					}
					pre = v + pre
				}
				fmt.Println("")
			}
		},
	}
	rootCmd.AddCommand(encodeCmd)
}
nabeyangnabeyang

書籍の情報と違ってデータの容量は172MBで圧縮後が40MBだった。decodeしてdiffとって差分が出なかったので、とりあえず良し。

$ go run . encode ./data/eid_tags.txt -o out.bin
$ ls -lh ./data/eid_tags.txt out.bin 
-rw-r--r--@ 1 nabeyang  staff   172M  1 22 13:12 ./data/eid_tags.txt
-rw-r--r--  1 nabeyang  staff    40M  1 29 18:03 out.bin
nabeyangnabeyang

キーワードリンク

ディレクトリ構造はこんな感じ。サブコマンドなんて要らなかった。

├── cmd
│   ├── replace.go
│   └── root.go
├── go.mod
├── go.sum
├── main.go
└── trie
    ├── trie.go
    └── trie_test.go

書籍と違い、入力文字列の中のキーワードを*で囲む変換をして標準出力するようにしてみた。AC法ではなく、普通にTrie木で。Trie木はここの高速化前の実装を参考に。

replace.go
package cmd

import (
	"bufio"
	"fmt"
	"html"
	"io"
	"os"

	"example.com/ch08/trie"
	"github.com/spf13/cobra"
)

func init() {
	printCmd := &cobra.Command{
		Use:  "replace",
		Args: cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			r, err := os.Open(args[0])
			if err != nil {
				return err
			}
			defer r.Close()
			words := make([]string, 0, 1000)
			rr := bufio.NewReaderSize(r, 8192)
			var word string
			for {
				line := make([]byte, 0, 10000)
				var isContinue bool = true
				for isContinue {
					bs, isPrefix, err := rr.ReadLine()
					if err == io.EOF {
						goto end
					}
					line = append(line, bs...)
					isContinue = isPrefix
				}
				word = html.UnescapeString(string(line))
				words = append(words, word)
			}
		end:
			in := bufio.NewReader(os.Stdin)
			var input string
			fmt.Fscan(in, &input)
			t := trie.NewTrie(words)

			fmt.Println(t.ReplaceAll(input, func(word string) string {
				return fmt.Sprintf(" *%s* ", word)
			}))
			return nil
		},
	}
	rootCmd.AddCommand(printCmd)
}
trie.go
package trie

type node struct {
	children map[rune]*node
	value    string
}

func newNode() *node {
	return &node{
		children: make(map[rune]*node),
	}
}

type Trie struct {
	root *node
}

func NewTrie(keys []string) *Trie {
	t := &Trie{
		root: newNode(),
	}
	for _, key := range keys {
		t.add(key)
	}
	return t
}

func (t *Trie) Match(input string, isGreedly bool) [][2]int {
	var results = make([][2]int, 0, 100)
	var r [][2]int
	chars := []rune(input)
	e := len(chars)
	b := 0
	for b < e {
		r = t.search(chars, b, isGreedly)
		n := len(r)
		if n > 0 {
			results = append(results, r...)
			b = r[n-1][0] + 1
		} else {
			b++
		}
	}
	return results
}

func (t *Trie) ReplaceAll(input string, replace func(string) string) string {
	chars := []rune(input)
	n := len(chars)
	dist := make([]rune, 0, n)
	results := t.Match(input, true)
	if len(results) == 0 {
		return input
	}
	start := 0
	for _, r := range results {
		pos, len := r[0], r[1]
		if pos < start {
			continue
		}
		dist = append(dist, chars[start:pos]...)
		dist = append(dist, []rune(replace(string(chars[pos:pos+len])))...)
		start = pos + len
	}
	if start < n {
		dist = append(dist, chars[start:]...)
	}
	return string(dist)
}

func (t *Trie) add(key string) {
	n := t.root
	for _, ch := range key {
		nn, ok := n.children[ch]
		if !ok {
			nn = newNode()
		}
		n.children[ch] = nn
		n = nn
	}
	n.value = key
}

// (開始, 長さ)
func (t *Trie) search(chars []rune, start int, isGreedly bool) [][2]int {
	end := len(chars)
	results := make([][2]int, 0, 100)
	for i := start; i < end; i++ {
		ts := make([][2]int, 0, 100)
		if n, ok := t.root.children[chars[i]]; ok {
			if n.value != "" {
				ts = append(ts, [2]int{i, 1})
			}
			for j := i + 1; j < end; j++ {
				if n, ok = n.children[chars[j]]; ok {
					if n.value != "" {
						ts = append(ts, [2]int{i, j - i + 1})
					}
				} else {
					break
				}
			}
		} else {
			break
		}
		n := len(ts)
		if isGreedly {
			if n > 0 {
				results = append(results, ts[n-1])
			}
		} else {
			results = append(results, ts...)
		}
	}
	return results
}
実行
go run . replace /path/to/keyword.utf8.uniq.txt < /path/to/input.txt
nabeyangnabeyang

テストはこんな感じで。

trie_test.go
package trie

import (
	"fmt"
	"testing"
)

func TestAdd(t *testing.T) {
	tr := NewTrie([]string{"ab", "abcde", "bc", "bab", "d"})
	expect := "abcde"
	actual := tr.root.
		children['a'].children['b'].children['c'].
		children['d'].children['e'].value
	if actual != expect {
		t.Errorf("expected %s, but got %s", expect, actual)
	}
	expect = "ab"
	actual = tr.root.
		children['a'].children['b'].value
	if actual != expect {
		t.Errorf("expected %s, but got %s", expect, actual)
	}
	expect = "bc"
	actual = tr.root.
		children['b'].children['c'].value
	if actual != expect {
		t.Errorf("expected %s, but got %s", expect, actual)
	}

	expect = "bab"
	actual = tr.root.
		children['b'].children['a'].children['b'].value
	if actual != expect {
		t.Errorf("expected %s, but got %s", expect, actual)
	}

	expect = "d"
	actual = tr.root.
		children['d'].value
	if actual != expect {
		t.Errorf("expected %s, but got %s", expect, actual)
	}
}

func TestSearch(t *testing.T) {
	tr := NewTrie([]string{"a", "he", "hers", "his", "she"})
	input := "a his hoge hershe"
	results := tr.Match(input, false)
	expects := [][2]int{
		{0, 1},
		{2, 3},
		{11, 2},
		{11, 4},
		{14, 3},
		{15, 2},
	}
	if len(results) != len(expects) {
		t.Fatalf("len(results) equals len(expects) but %d != %d", len(results), len(expects))
	}
	chars := []rune(input)
	for i, expect := range expects {
		r := results[i]
		j, k := r[0], r[1]
		l, m := expect[0], expect[1]
		if l != j || m != k {
			t.Errorf("expected %s:(%d, %d), but got %s:(%d, %d)", string(chars[l:l+m]), l, m, string(chars[j:j+k]), j, k)
		}
	}
}

func TestSearch2(t *testing.T) {
	tr := NewTrie([]string{"a", "he", "hers", "his", "she"})
	input := "a his hoge hershe"
	results := tr.Match(input, true)
	expects := [][2]int{
		{0, 1},
		{2, 3},
		{11, 4},
		{14, 3},
		{15, 2},
	}
	if len(results) != len(expects) {
		t.Fatalf("len(results) equals len(expects) but %d != %d", len(results), len(expects))
	}
	chars := []rune(input)
	for i, expect := range expects {
		r := results[i]
		j, k := r[0], r[1]
		l, m := expect[0], expect[1]
		if l != j || m != k {
			t.Errorf("expected %s:(%d, %d), but got %s:(%d, %d)", string(chars[l:l+m]), l, m, string(chars[j:j+k]), j, k)
		}
	}
}

func TestReplaceAll(t *testing.T) {
	tr := NewTrie([]string{"世界"})
	input := "こんにちは、世界"
	actual := tr.ReplaceAll(input, func(word string) string {
		return fmt.Sprintf("*%s*", word)
	})
	expect := "こんにちは、*世界*"
	if actual != expect {
		t.Errorf("expected %s, but got %s", expect, actual)
	}
}

func TestReplaceAll2(t *testing.T) {
	tr := NewTrie([]string{"he", "hers", "his", "she"})
	input := "a his hoge hershe"
	actual := tr.ReplaceAll(input, func(word string) string {
		return fmt.Sprintf("*%s*", word)
	})
	expect := "a *his* hoge *hers**he*"
	if actual != expect {
		t.Errorf("expected %s, but got %s", expect, actual)
	}
}