diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ae800f8..0f94f91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: strategy: matrix: - go: [ '1.19.x', '1.20.x' ] + go: [ '1.21.x', '1.22.x' ] steps: - name: Checkout diff --git a/README.md b/README.md index f4b10da..25e951e 100644 --- a/README.md +++ b/README.md @@ -19,11 +19,31 @@ [![demo](https://gh.kaos.st/uc-110.gif)](#usage-demo) +### Benchmarks + +``` +$ wc -l data.txt +18408096 data.txt + +$ hyperfine 'sort -u data.txt | wc -l' 'uc -np data.txt' +Benchmark 1: sort -u data.txt | wc -l + Time (mean ± σ): 16.030 s ± 0.181 s [User: 86.713 s, System: 1.165 s] + Range (min … max): 15.699 s … 16.324 s 10 runs + +Benchmark 2: uc -np data.txt + Time (mean ± σ): 2.889 s ± 0.101 s [User: 2.435 s, System: 0.454 s] + Range (min … max): 2.721 s … 3.065 s 10 runs + +Summary + uc -np data.txt ran + 5.55 ± 0.20 times faster than sort -u data.txt | wc -l +``` + ### Installation #### From sources -To build the `uc` from scratch, make sure you have a working Go 1.19+ workspace (_[instructions](https://go.dev/doc/install)_), then: +To build the `uc` from scratch, make sure you have a working Go 1.20+ workspace (_[instructions](https://go.dev/doc/install)_), then: ``` go install github.com/essentialkaos/uc@latest @@ -82,10 +102,9 @@ Usage: uc {options} file Options - --dist, -d Show number of occurrences for every line + --dist, -d format Show number of occurrences for every line (-/simple/table/json) --max, -m num Max number of unique lines --no-progress, -np Disable progress output - --no-progress, -np Disable progress output --no-color, -nc Disable colors in output --help, -h Show this help message --version, -v Show version @@ -98,6 +117,9 @@ Examples uc -d file.txt Show distribution for file.txt + uc --dist=table file.txt + Show distribution as a table for file.txt + uc -d -m 5k file.txt Show distribution for file.txt with 5,000 uniq lines max diff --git 
a/cli/cli.go b/cli/cli.go index f0319c3..ac611be 100644 --- a/cli/cli.go +++ b/cli/cli.go @@ -2,7 +2,7 @@ package cli // ////////////////////////////////////////////////////////////////////////////////// // // // -// Copyright (c) 2023 ESSENTIAL KAOS // +// Copyright (c) 2024 ESSENTIAL KAOS // // Apache License, Version 2.0 // // // // ////////////////////////////////////////////////////////////////////////////////// // @@ -10,7 +10,6 @@ package cli import ( "bufio" "fmt" - "hash/crc64" "os" "runtime" "sort" @@ -21,7 +20,9 @@ import ( "github.com/essentialkaos/ek/v12/fmtc" "github.com/essentialkaos/ek/v12/fmtutil" + "github.com/essentialkaos/ek/v12/fmtutil/table" "github.com/essentialkaos/ek/v12/fsutil" + "github.com/essentialkaos/ek/v12/mathutil" "github.com/essentialkaos/ek/v12/options" "github.com/essentialkaos/ek/v12/signal" "github.com/essentialkaos/ek/v12/strutil" @@ -33,6 +34,8 @@ import ( "github.com/essentialkaos/ek/v12/usage/man" "github.com/essentialkaos/ek/v12/usage/update" + "github.com/cespare/xxhash" + "github.com/essentialkaos/uc/cli/support" ) @@ -41,7 +44,7 @@ import ( // Application basic info const ( APP = "uc" - VER = "2.0.1" + VER = "3.0.0" DESC = "Tool for counting unique lines" ) @@ -68,8 +71,8 @@ const MAX_SAMPLE_SIZE = 512 // Stats contains data info type Stats struct { - Counters map[uint64]uint32 // crc64 → num - Samples map[uint64]string // crc64 → sample (512 symbols) + Counters map[uint64]uint32 // hash → num + Samples map[uint64][]byte // hash → sample (512 symbols) LastReadLines uint64 LastReadBytes float64 TotalReadLines uint64 @@ -101,7 +104,7 @@ func (s linesSlice) Less(i, j int) bool { // optMap is map with options var optMap = options.Map{ OPT_MAX_LINES: {Type: options.INT}, - OPT_DISTRIBUTION: {Type: options.BOOL}, + OPT_DISTRIBUTION: {Type: options.MIXED}, OPT_NO_PROGRESS: {Type: options.BOOL}, OPT_NO_COLOR: {Type: options.BOOL}, OPT_HELP: {Type: options.BOOL}, @@ -173,6 +176,10 @@ func preConfigureUI() { fmtc.DisableColors = 
true
 		rawMode = true
 	}
+
+	table.FullScreen = false
+	table.HeaderCapitalize = true
+	table.BorderSymbol = "–"
 }
 
 // configureUI configures user interface
@@ -195,8 +202,6 @@ func configureUI() {
 
 // processData starts data processing
 func processData(args options.Arguments) {
-	var r *bufio.Reader
-
 	stats = &Stats{
 		Counters: make(map[uint64]uint32),
 		mx:       &sync.Mutex{},
@@ -205,19 +210,18 @@ func processData(args options.Arguments) {
 	input := getInput(args)
 
 	if input == "-" {
-		r = bufio.NewReader(os.Stdin)
-	} else {
-		fd, err := os.OpenFile(input, os.O_RDONLY, 0)
+		readData(bufio.NewScanner(os.Stdin))
+		return
+	}
 
-		if err != nil {
-			printError(err.Error())
-			os.Exit(1)
-		}
+	fd, err := os.OpenFile(input, os.O_RDONLY, 0)
 
-		r = bufio.NewReader(fd)
+	if err != nil {
+		printError(err.Error())
+		os.Exit(1)
 	}
 
-	readData(bufio.NewScanner(r))
+	readData(bufio.NewScanner(fd))
 }
 
 // getInput returns input for reading data
@@ -239,7 +243,6 @@ func getInput(args options.Arguments) string {
 
 // readData reads data
 func readData(s *bufio.Scanner) {
-	ct := crc64.MakeTable(crc64.ECMA)
 	dist := options.GetB(OPT_DISTRIBUTION)
 	maxLines, err := parseMaxLines(options.GetS(OPT_MAX_LINES))
 
@@ -249,7 +252,7 @@ func readData(s *bufio.Scanner) {
 	}
 
 	if dist {
-		stats.Samples = make(map[uint64]string)
+		stats.Samples = make(map[uint64][]byte)
 	}
 
 	stats.LastReadDate = time.Now()
@@ -261,7 +264,7 @@ func readData(s *bufio.Scanner) {
 	for s.Scan() {
 		data := s.Bytes()
 		dataLen := float64(len(data))
-		dataCrc := crc64.Checksum(data, ct)
+		dataCrc := xxhash.Sum64(data)
 
 		stats.mx.Lock()
 
@@ -276,7 +279,10 @@ func readData(s *bufio.Scanner) {
 			_, exist := stats.Samples[dataCrc]
 
 			if !exist {
-				stats.Samples[dataCrc] = strutil.Substr(string(data), 0, MAX_SAMPLE_SIZE)
+				// s.Bytes() points into the scanner's internal buffer, which is
+				// overwritten by later Scan calls, so the sample must be copied
+				size := mathutil.Min(len(data), MAX_SAMPLE_SIZE)
+				stats.Samples[dataCrc] = append([]byte(nil), data[:size]...)
 			}
 		}
 
@@ -343,11 +349,83 @@ func printDistribution() {
 		distData = append(distData, LineInfo{crc, num})
 	}
 
+	fmtc.TPrintf("")
+
 	sort.Sort(sort.Reverse(distData))
 
-	for _, info := range distData {
-		fmtc.TPrintf(" %7d %s\n", info.Num, stats.Samples[info.CRC])
+	switch options.GetS(OPT_DISTRIBUTION) {
+	case "simple":
+		printDistributionSimple(distData)
+	case "table":
+		printDistributionTable(distData)
+	case "json":
+		printDistributionJSON(distData)
+	default:
+		printDistributionDefault(distData)
+	}
+}
+
+// printDistributionDefault prints distribution info in default format
+func printDistributionDefault(data linesSlice) {
+	for _, info := range data {
+		fmtc.Printf(" %7d %s\n", info.Num, string(stats.Samples[info.CRC]))
+	}
+}
+
+// printDistributionSimple prints distribution info in simple format
+func printDistributionSimple(data linesSlice) {
+	for _, info := range data {
+		fmtc.Printf("%d %s\n", info.Num, string(stats.Samples[info.CRC]))
+	}
+}
+
+// printDistributionTable prints distribution info as a table
+func printDistributionTable(data linesSlice) {
+	t := table.NewTable("#", "DATA")
+
+	for _, info := range data {
+		t.Add(fmtutil.PrettyNum(info.Num), string(stats.Samples[info.CRC]))
 	}
+
+	t.Render()
+}
+
+// jsonQuote returns data as a double-quoted JSON string literal: quotes,
+// backslashes and control characters are escaped, and invalid UTF-8 bytes
+// are replaced with U+FFFD
+func jsonQuote(data []byte) string {
+	buf := make([]byte, 0, len(data)+2)
+	buf = append(buf, '"')
+
+	for _, r := range string(data) {
+		switch {
+		case r == '"' || r == '\\':
+			buf = append(buf, '\\', byte(r))
+		case r < 0x20:
+			buf = append(buf, fmt.Sprintf(`\u%04x`, r)...)
+		default:
+			buf = append(buf, string(r)...)
+		}
+	}
+
+	return string(append(buf, '"'))
+}
+
+// printDistributionJSON prints distribution info in JSON format
+func printDistributionJSON(data linesSlice) {
+	fmt.Println("[")
+
+	for index, info := range data {
+		fmt.Printf(` {"num":%d, "data":%s}`, info.Num, jsonQuote(stats.Samples[info.CRC]))
+
+		if index+1 != len(data) {
+			fmt.Println(",")
+		} else {
+			fmt.Println("")
+		}
+	}
+
+	fmt.Println("]")
 }
 
 // parseMaxLines parses max line option
@@ -423,16 +501,16 @@ func genUsage() *usage.Info {
 
 	info.AppNameColorTag = colorTagApp
 
-	info.AddOption(OPT_DISTRIBUTION, "Show number of occurrences for every line")
+	info.AddOption(OPT_DISTRIBUTION, "Show number of occurrences for every line {s-}(-/simple/table/json){!}", "?format")
 	info.AddOption(OPT_MAX_LINES, "Max number of unique lines", "num")
 	info.AddOption(OPT_NO_PROGRESS, "Disable progress output")
-	info.AddOption(OPT_NO_PROGRESS, "Disable progress output")
 	info.AddOption(OPT_NO_COLOR, "Disable colors in output")
info.AddOption(OPT_HELP, "Show this help message") info.AddOption(OPT_VER, "Show version") info.AddExample("file.txt", "Count unique lines in file.txt") info.AddExample("-d file.txt", "Show distribution for file.txt") + info.AddExample("--dist=table file.txt", "Show distribution as a table for file.txt") info.AddExample("-d -m 5k file.txt", "Show distribution for file.txt with 5,000 uniq lines max") info.AddRawExample("cat file.txt | "+APP, "Count unique lines in stdin data") diff --git a/cli/support/support.go b/cli/support/support.go index 74070a5..588ea54 100644 --- a/cli/support/support.go +++ b/cli/support/support.go @@ -2,7 +2,7 @@ package support // ////////////////////////////////////////////////////////////////////////////////// // // // -// Copyright (c) 2023 ESSENTIAL KAOS // +// Copyright (c) 2024 ESSENTIAL KAOS // // Apache License, Version 2.0 // // // // ////////////////////////////////////////////////////////////////////////////////// // diff --git a/common/uc.spec b/common/uc.spec index 8d2ec89..4c68874 100644 --- a/common/uc.spec +++ b/common/uc.spec @@ -6,7 +6,7 @@ Summary: Simple utility for counting unique lines Name: uc -Version: 2.0.1 +Version: 3.0.0 Release: 0%{?dist} Group: Applications/System License: Apache License, Version 2.0 @@ -16,7 +16,7 @@ Source0: https://source.kaos.st/%{name}/%{name}-%{version}.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -BuildRequires: golang >= 1.19 +BuildRequires: golang >= 1.20 Provides: %{name} = %{version}-%{release} @@ -54,6 +54,34 @@ install -pm 755 %{name}/%{name} %{buildroot}%{_bindir}/ %clean rm -rf %{buildroot} +%post +if [[ -d %{_sysconfdir}/bash_completion.d ]] ; then + %{name} --completion=bash 1> %{_sysconfdir}/bash_completion.d/%{name} 2>/dev/null +fi + +if [[ -d %{_datarootdir}/fish/vendor_completions.d ]] ; then + %{name} --completion=fish 1> %{_datarootdir}/fish/vendor_completions.d/%{name}.fish 2>/dev/null +fi + +if [[ -d %{_datadir}/zsh/site-functions ]] 
; then + %{name} --completion=zsh 1> %{_datadir}/zsh/site-functions/_%{name} 2>/dev/null +fi + +%postun +if [[ $1 == 0 ]] ; then + if [[ -f %{_sysconfdir}/bash_completion.d/%{name} ]] ; then + rm -f %{_sysconfdir}/bash_completion.d/%{name} &>/dev/null || : + fi + + if [[ -f %{_datarootdir}/fish/vendor_completions.d/%{name}.fish ]] ; then + rm -f %{_datarootdir}/fish/vendor_completions.d/%{name}.fish &>/dev/null || : + fi + + if [[ -f %{_datadir}/zsh/site-functions/_%{name} ]] ; then + rm -f %{_datadir}/zsh/site-functions/_%{name} &>/dev/null || : + fi +fi + ################################################################################ %files @@ -65,6 +93,12 @@ rm -rf %{buildroot} ################################################################################ %changelog +* Mon Feb 19 2024 Anton Novojilov - 3.0.0-0 +- crc64 replaced by xxhash +- Added different output formats for distribution info +- Code refactoring +- Dependencies update + * Tue Dec 19 2023 Anton Novojilov - 2.0.1-0 - Dependencies update - Code refactoring diff --git a/go.mod b/go.mod index a6c425a..a0e754a 100644 --- a/go.mod +++ b/go.mod @@ -3,8 +3,9 @@ module github.com/essentialkaos/uc go 1.18 require ( + github.com/cespare/xxhash v1.1.0 github.com/essentialkaos/depsy v1.1.0 - github.com/essentialkaos/ek/v12 v12.92.0 + github.com/essentialkaos/ek/v12 v12.100.0 ) -require golang.org/x/sys v0.15.0 // indirect +require golang.org/x/sys v0.17.0 // indirect diff --git a/go.sum b/go.sum index 70b6add..d27293c 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,16 @@ +github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE= +github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= +github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= +github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/essentialkaos/check v1.4.0 h1:kWdFxu9odCxUqo1NNFNJmguGrDHgwi3A8daXX1nkuKk= 
github.com/essentialkaos/depsy v1.1.0 h1:U6dp687UkQwXlZU17Hg2KMxbp3nfZAoZ8duaeUFYvJI= github.com/essentialkaos/depsy v1.1.0/go.mod h1:kpiTAV17dyByVnrbNaMcZt2jRwvuXClUYOzpyJQwtG8= -github.com/essentialkaos/ek/v12 v12.92.0 h1:3JIkHWNA6MNkJOfqzMWJ8jN9sRM7nRi7URoFRVFHZzI= -github.com/essentialkaos/ek/v12 v12.92.0/go.mod h1:9efMqo1S8EtYhmeelOSTmMQDGC2vRgPkjkKKfvUD2eU= +github.com/essentialkaos/ek/v12 v12.100.0 h1:bup8cqsUUXJtKHAgdt2eHjYKBFU/0rPbg/us2H1E46I= +github.com/essentialkaos/ek/v12 v12.100.0/go.mod h1:VjMWDJ1r4HsfBYJuCNDUo4R1lhCgYkFZOMOH0S3W6iM= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= -golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= -golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ= +github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/uc.go b/uc.go index d08f46b..963fa14 100644 --- a/uc.go +++ b/uc.go @@ -2,7 +2,7 @@ package main // ////////////////////////////////////////////////////////////////////////////////// // // // -// Copyright (c) 2023 ESSENTIAL KAOS // +// Copyright (c) 2024 ESSENTIAL KAOS // // Apache License, Version 2.0 // // // // ////////////////////////////////////////////////////////////////////////////////// //