prefix analyse

HDT3213 · Feb 24, 2024 · 94c4356 · 94c4356
1 parent 83d771e
commit 94c4356
Show file tree

Hide file tree

Showing 16 changed files with 528 additions and 55 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,17 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Launch Package",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "program": "${workspaceFolder}",
+            "args": ["-c", "prefix", "-n", "20", "-o", "tree.csv", "large.rdb"]
+        }
+
+    ]
+}
diff --git a/README.md b/README.md
@@ -50,9 +50,9 @@ use `rdb` command in terminal, you can see it's manual
 ```
 This is a tool to parse Redis' RDB files
 Options:
-  -c command, including: json/memory/aof/bigkey/flamegraph
-  -o output file path
-  -n number of result, using in 
+  -c command, including: json/memory/aof/bigkey/prefix/flamegraph
+  -o output file path, if there is no `-o` option, output to stdout
+  -n number of result, using in command: bigkey/prefix
   -port listen port for flame graph web service
   -sep separator for flamegraph, rdb will separate key by it, default value is ":". 
                 supporting multi separators: -sep sep1 -sep sep2 
@@ -69,7 +69,9 @@ parameters between '[' and ']' is optional
   rdb -c aof -o dump.aof dump.rdb
 4. get largest keys
   rdb -c bigkey [-o dump.aof] [-n 10] dump.rdb
-5. draw flamegraph
+5. get number and size by prefix
+  rdb -c prefix [-n 10] [-max-depth 3] [-o prefix-report.csv] dump.rdb
+6. draw flamegraph
   rdb -c flamegraph [-port 16379] [-sep :] dump.rdb
 ```
 
@@ -258,13 +260,13 @@ The examples for json result:
 
 RDB uses rdb encoded size to estimate redis memory usage.
 
-```
+```bash
 rdb -c memory -o <output_path> <source_path>
 ```
 
 Example:
 
-```
+```bash
 rdb -c memory -o mem.csv cases/memory.rdb
 ```
 
@@ -281,6 +283,40 @@ database,key,type,size,size_readable,element_count
 0,set,set,39,39B,2
 ```
 
+# Analyze By Prefix
+
+If you can distinguish modules by the prefix of the key — for example, the key of user data is `User:<uid>`, the key of a post is `Post:<postid>`, user statistics are stored under `Stat:User:???`, and post statistics under `Stat:Post:???` — then prefix analysis shows the status of each module:
+
+```csv
+database,prefix,size,size_readable,key_count
+0,Post:,1170456184,1.1G,701821
+0,Stat:,405483812,386.7M,3759832
+0,Stat:Post:,291081520,277.6M,2775043
+0,User:,241572272,230.4M,265810
+0,Topic:,171146778,163.2M,694498
+0,Topic:Post:,163635096,156.1M,693758
+0,Stat:Post:View,133201208,127M,1387516
+0,Stat:User:,114395916,109.1M,984724
+0,Stat:Post:Comment:,80178504,76.5M,693758
+0,Stat:Post:Like:,77701688,74.1M,693768
+```
+
+Format:
+
+```bash
+rdb -c prefix [-n <top-n>] [-max-depth <max-depth>] [-o <output_path>] <source_path>
+```
+
+- The prefix analysis results are sorted in descending order of memory usage. The `-n` option limits the number of results; by default all results are output.
+
+- `-max-depth` can limit the maximum depth of the prefix tree. In the above example, the depth of `Stat:` is 1, and the depth of `Stat:User:` and `Stat:Post:` is 2.
+
+Example:
+
+```bash
+rdb -c prefix -n 10 -o prefix.csv cases/memory.rdb
+```
+
 # Flame Graph
 
 In many cases there is not a few very large key but lots of small keys that occupied most memory.

diff --git a/README_CN.md b/README_CN.md
@@ -269,9 +269,44 @@ database,key,type,size,size_readable,element_count
 0,set,set,39,39B,2
 ```
 
+# 前缀分析
+
+如果您可以根据 key 的前缀区分模块，比如用户数据的 key 是 `User:<uid>`，Post 的 key 是 `Post:<postid>`，用户统计信息是 `Stat:User:???`，Post 的统计信息是 `Stat:Post:???`。那么我们可以通过前缀分析来得到各模块的情况：
+
+```csv
+database,prefix,size,size_readable,key_count
+0,Post:,1170456184,1.1G,701821
+0,Stat:,405483812,386.7M,3759832
+0,Stat:Post:,291081520,277.6M,2775043
+0,User:,241572272,230.4M,265810
+0,Topic:,171146778,163.2M,694498
+0,Topic:Post:,163635096,156.1M,693758
+0,Stat:Post:View,133201208,127M,1387516
+0,Stat:User:,114395916,109.1M,984724
+0,Stat:Post:Comment:,80178504,76.5M,693758
+0,Stat:Post:Like:,77701688,74.1M,693768
+```
+
+命令格式：
+
+```bash
+rdb -c prefix [-n <top-n>] [-max-depth <max-depth>] [-o <output_path>] <source_path>
+```
+
+- 前缀分析结果按照内存空间从大到小排列，`-n` 选项可以指定输出的数量。默认全部输出。
+
+- `-max-depth` 可以限制前缀树的最大深度。比如示例中 `Stat:` 的深度是 1，`Stat:User:` 和 `Stat:Post:` 的深度是 2。
+
+Example:
+
+```bash
+rdb -c prefix -n 10 -o prefix.csv cases/memory.rdb
+```
+
+
 # 火焰图
 
-在很多时候并不是少量的大键值对占据了大部分内存，而是数量巨大的小键值对消耗了很多内存。目前市面上尚无分析工具可以有效处理这个问题。
+在很多时候并不是少量的大键值对占据了大部分内存，而是数量巨大的小键值对消耗了很多内存。
 
 很多企业要求使用 Redis key 采用类似于 `user:basic.info:{userid}` 的命名规范，所以我们可以使用分隔符将 key 拆分并将拥有相同前缀的 key 聚合在一起。
 

diff --git a/cases/tree.csv b/cases/tree.csv
@@ -0,0 +1,4 @@
+database,prefix,size,size_readable,key_count
+0,a,424,424B,6
+0,ab,368,368B,5
+0,abb,232,232B,3
diff --git a/cases/tree.rdb b/cases/tree.rdb
diff --git a/cases/tree2.csv b/cases/tree2.csv
@@ -0,0 +1,4 @@
+database,prefix,size,size_readable,key_count
+0,a,424,424B,6
+0,ab,368,368B,5
+0,b,64,64B,1
diff --git a/cmd.go b/cmd.go
@@ -3,17 +3,18 @@ package main
 import (
 	"flag"
 	"fmt"
-	"github.com/hdt3213/rdb/helper"
 	"os"
 	"strings"
+
+	"github.com/hdt3213/rdb/helper"
 )
 
 const help = `
 This is a tool to parse Redis' RDB files
 Options:
-  -c command, including: json/memory/aof/bigkey/flamegraph
+  -c command, including: json/memory/aof/bigkey/prefix/flamegraph
   -o output file path
-  -n number of result, using in 
+  -n number of result, using in command: bigkey/prefix
   -port listen port for flame graph web service
   -sep separator for flamegraph, rdb will separate key by it, default value is ":". 
 		supporting multi separators: -sep sep1 -sep sep2 
@@ -30,7 +31,9 @@ parameters between '[' and ']' is optional
   rdb -c aof -o dump.aof dump.rdb
 4. get largest keys
   rdb -c bigkey [-o dump.aof] [-n 10] dump.rdb
-5. draw flamegraph
+5. get number and memory size by prefix
+  rdb -c prefix [-n 10] [-max-depth 3] [-o prefix-report.csv] dump.rdb
+6. draw flamegraph
   rdb -c flamegraph [-port 16379] [-sep :] dump.rdb
 `
 
@@ -54,9 +57,12 @@ func main() {
 	var seps separators
 	var regexExpr string
 	var noExpired bool
+	var maxDepth int
+	var err error
 	flagSet.StringVar(&cmd, "c", "", "command for rdb: json")
 	flagSet.StringVar(&output, "o", "", "output file path")
 	flagSet.IntVar(&n, "n", 0, "")
+	flagSet.IntVar(&maxDepth, "max-depth", 0, "max depth of prefix tree")
 	flagSet.IntVar(&port, "port", 0, "listen port for web")
 	flagSet.Var(&seps, "sep", "separator for flame graph")
 	flagSet.StringVar(&regexExpr, "regex", "", "regex expression")
@@ -81,7 +87,19 @@ func main() {
 		options = append(options, helper.WithNoExpiredOption())
 	}
 
-	var err error
+	var outputFile *os.File
+	if output == "" {
+		outputFile = os.Stdout
+	} else {
+		outputFile, err = os.Create(output)
+		if err != nil {
+			fmt.Printf("open output faild: %v", err)
+		}
+		defer func() {
+			_ = outputFile.Close()
+		}()
+	}
+
 	switch cmd {
 	case "json":
 		err = helper.ToJsons(src, output, options...)
@@ -90,19 +108,9 @@ func main() {
 	case "aof":
 		err = helper.ToAOF(src, output, options)
 	case "bigkey":
-		if output == "" {
-			err = helper.FindBiggestKeys(src, n, os.Stdout, options...)
-		} else {
-			var outputFile *os.File
-			outputFile, err = os.Create(output)
-			if err != nil {
-				fmt.Printf("open output faild: %v", err)
-			}
-			defer func() {
-				_ = outputFile.Close()
-			}()
-			err = helper.FindBiggestKeys(src, n, outputFile, options...)
-		}
+		err = helper.FindBiggestKeys(src, n, outputFile, options...)
+	case "prefix":
+		err = helper.PrefixAnalyse(src, n, maxDepth, outputFile, options...)
 	case "flamegraph":
 		_, err = helper.FlameGraph(src, port, seps, options...)
 		if err != nil {

diff --git a/cmd_test.go b/cmd_test.go
@@ -52,6 +52,11 @@ func TestCmd(t *testing.T) {
 	if f, _ := os.Stat("tmp/memory_regex.csv"); f == nil {
 		t.Error("command memory failed")
 	}
+	os.Args = []string{"", "-c", "prefix", "-o", "tmp/tree.csv", "cases/tree.rdb"}
+	main()
+	if f, _ := os.Stat("tmp/tree.csv"); f == nil {
+		t.Error("command prefix failed")
+	}
 
 	// test error command line
 	os.Args = []string{"", "-c", "json", "-o", "tmp/output", "/none/a"}

diff --git a/helper/bigkey.go b/helper/bigkey.go
@@ -4,37 +4,14 @@ import (
 	"encoding/csv"
 	"errors"
 	"fmt"
+	"os"
+	"strconv"
+
 	"github.com/hdt3213/rdb/bytefmt"
 	"github.com/hdt3213/rdb/core"
 	"github.com/hdt3213/rdb/model"
-	"os"
-	"sort"
-	"strconv"
 )
 
-type topList struct {
-	list     []model.RedisObject
-	capacity int
-}
-
-func (tl *topList) add(x model.RedisObject) {
-	index := sort.Search(len(tl.list), func(i int) bool {
-		return tl.list[i].GetSize() <= x.GetSize()
-	})
-	tl.list = append(tl.list, x)
-	copy(tl.list[index+1:], tl.list[index:])
-	tl.list[index] = x
-	if len(tl.list) > tl.capacity {
-		tl.list = tl.list[:tl.capacity]
-	}
-}
-
-func newRedisHeap(cap int) *topList {
-	return &topList{
-		capacity: cap,
-	}
-}
-
 // FindBiggestKeys read rdb file and find the largest N keys.
 // The invoker owns output, FindBiggestKeys won't close it
 func FindBiggestKeys(rdbFilename string, topN int, output *os.File, options ...interface{}) error {
@@ -55,7 +32,7 @@ func FindBiggestKeys(rdbFilename string, topN int, output *os.File, options ...i
 	if dec, err = wrapDecoder(dec, options...); err != nil {
 		return err
 	}
-	top := newRedisHeap(topN)
+	top := newToplist(topN)
 	err = dec.Parse(func(object model.RedisObject) bool {
 		top.add(object)
 		return true
@@ -69,7 +46,8 @@ func FindBiggestKeys(rdbFilename string, topN int, output *os.File, options ...i
 	}
 	csvWriter := csv.NewWriter(output)
 	defer csvWriter.Flush()
-	for _, object := range top.list {
+	for _, o := range top.list {
+		object := o.(model.RedisObject)
 		err = csvWriter.Write([]string{
 			strconv.Itoa(object.GetDBIndex()),
 			object.GetKey(),

diff --git a/helper/bigkey_test.go b/helper/bigkey_test.go
@@ -1,13 +1,14 @@
 package helper
 
 import (
-	"github.com/hdt3213/rdb/model"
 	"math/rand"
 	"os"
 	"path/filepath"
 	"sort"
 	"strconv"
 	"testing"
+
+	"github.com/hdt3213/rdb/model"
 )
 
 func TestTopList(t *testing.T) {
@@ -24,7 +25,7 @@ func TestTopList(t *testing.T) {
 		}
 		objects = append(objects, o)
 	}
-	topList := newRedisHeap(topN)
+	topList := newToplist(topN)
 	for _, o := range objects {
 		topList.add(o)
 	}