Skip to content
This repository was archived by the owner on May 14, 2023. It is now read-only.

Commit 45550be

Browse files
committed
add project
1 parent 9a3298c commit 45550be

File tree

19 files changed

+925
-2
lines changed

19 files changed

+925
-2
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
bin
2+
13
# Compiled Object files, Static and Dynamic libs (Shared Objects)
24
*.o
35
*.a

Makefile

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
BASE_DIR=$(shell echo $$GOPATH)/src/github.com/jdkato/aptag
2+
BUILD_DIR=./builds
3+
COMMIT= `git rev-parse --short HEAD 2>/dev/null`
4+
5+
VERSION_FILE=$(BASE_DIR)/VERSION
6+
VERSION=$(shell cat $(VERSION_FILE))
7+
8+
LDFLAGS=-ldflags "-s -w -X main.Version=$(VERSION)"
9+
10+
.PHONY: clean test lint ci cross install bump model setup
11+
12+
all: model build
13+
14+
build:
15+
go build ${LDFLAGS} -o bin/aptag ./cmd/aptag
16+
17+
build-win:
18+
go build ${LDFLAGS} -o bin/aptag.exe ./cmd/aptag
19+
20+
test:
21+
go test -v ./...
22+
23+
ci: test lint
24+
25+
lint:
26+
gometalinter --vendor --disable-all \
27+
--enable=deadcode \
28+
--enable=ineffassign \
29+
--enable=gosimple \
30+
--enable=staticcheck \
31+
--enable=gofmt \
32+
--enable=goimports \
33+
--enable=dupl \
34+
--enable=misspell \
35+
--enable=errcheck \
36+
--enable=vet \
37+
--enable=vetshadow \
38+
--deadline=1m \
39+
./...
40+
41+
setup:
42+
go get -u github.com/alecthomas/gometalinter
43+
go get -u github.com/stretchr/testify/assert
44+
go get -u github.com/urfave/cli
45+
go get -u github.com/jteeuwen/go-bindata/...
46+
go-bindata -ignore=\\.DS_Store -pkg="model" -o model/model.go model/
47+
gometalinter --install
48+
49+
# Version components are derived from VERSION (the contents of $(VERSION_FILE)).
# VER defaults to the next patch release; override it on the command line,
# e.g. `make bump VER=1.0.0`.
MAJOR=$(word 1, $(subst ., , $(VERSION)))
MINOR=$(word 2, $(subst ., , $(VERSION)))
PATCH=$(word 3, $(subst ., , $(VERSION)))
VER ?= $(MAJOR).$(MINOR).$(shell echo $$(($(PATCH)+1)))

# bump writes the new version number back to $(VERSION_FILE).
bump:
	echo $(VER) > $(VERSION_FILE)
56+
57+
model:
58+
go-bindata -ignore=\\.DS_Store -pkg="model" -o model/model.go model/

README.md

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,62 @@
1-
# tag
2-
An English-language Part-of-Speech Tagger
1+
# aptag
2+
3+
An English-language Part-of-Speech Tagger:
4+
5+
```go
6+
import (
7+
"fmt"
8+
9+
"github.com/jdkato/aptag"
10+
)
11+
12+
func main() {
13+
text := "Dive into NLTK: Part-of-speech tagging and POS Tagger."
14+
tagger := aptag.NewPerceptronTagger()
15+
16+
tokens := tagger.TokenizeAndTag(text)
17+
for _, tok := range tokens {
18+
fmt.Println(tok.Text, tok.Tag)
19+
}
20+
}
21+
```
22+
23+
## Install
24+
25+
```
26+
go get github.com/jdkato/aptag
27+
```
28+
29+
## Performance
30+
31+
| Library | Accuracy | Time (sec) |
32+
|:--------|---------:|-----------:|
33+
| NLTK | 0.893 | 6.755 |
34+
| aptag | 0.961 | 2.879 |
35+
36+
(see `scripts/test_model.py`.)
37+
38+
## Notice
39+
40+
This is a port of [`textblob-aptagger`](https://github.com/sloria/textblob-aptagger):
41+
42+
```
43+
Copyright 2013 Matthew Honnibal
44+
45+
Permission is hereby granted, free of charge, to any person obtaining a copy
46+
of this software and associated documentation files (the "Software"), to deal
47+
in the Software without restriction, including without limitation the rights
48+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
49+
copies of the Software, and to permit persons to whom the Software is
50+
furnished to do so, subject to the following conditions:
51+
52+
The above copyright notice and this permission notice shall be included in
53+
all copies or substantial portions of the Software.
54+
55+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
56+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
57+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
58+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
59+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
60+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
61+
THE SOFTWARE.
62+
```

VERSION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0.1.0

aptag.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
package aptag
2+
3+
import (
4+
"regexp"
5+
"strconv"
6+
"strings"
7+
8+
"github.com/jdkato/aptag/tokenize"
9+
"gopkg.in/neurosnap/sentences.v1"
10+
"gopkg.in/neurosnap/sentences.v1/english"
11+
)
12+
13+
// none matches tokens that Tag labels "-NONE-" without consulting the model:
// "0", "*", "*" + one word character or "?" + "*", "*-" followed by 1-3 digits,
// and "*" + uppercase letters + "*-" followed by 1-3 digits.
// NOTE(review): these look like treebank empty-element markers — confirm.
var none = regexp.MustCompile(`^(?:0|\*[\w?]\*|\*\-\d{1,3}|\*[A-Z]+\*\-\d{1,3}|\*)$`)
14+
// keep matches tokens made of exactly three uppercase letters wrapped in
// hyphens (e.g. "-LRB-"); Tag uses such a token verbatim as its own tag.
var keep = regexp.MustCompile(`^\-[A-Z]{3}\-$`)
15+
16+
// PerceptronTagger bundles the model and the tokenizers needed to turn raw
// text, or pre-split words, into part-of-speech tagged tokens.
17+
type PerceptronTagger struct {
18+
// Model supplies the TagMap lookup and the Predict call used by Tag.
Model *AveragedPerceptron
19+
// STokenizer splits a corpus into sentences in TokenizeAndTag.
STokenizer *sentences.DefaultSentenceTokenizer
20+
// WTokenizer splits the text of one sentence into words in TokenizeAndTag.
WTokenizer tokenize.WordTokenizer
21+
}
22+
23+
// NewPerceptronTagger ...
24+
func NewPerceptronTagger() *PerceptronTagger {
25+
var pt PerceptronTagger
26+
var err error
27+
28+
pt.Model = NewAveragedPerceptron()
29+
pt.STokenizer, err = english.NewSentenceTokenizer(nil)
30+
checkError(err)
31+
pt.WTokenizer = tokenize.WordTokenizerFn
32+
33+
return &pt
34+
}
35+
36+
// Tag ...
37+
func (pt PerceptronTagger) Tag(words []string) []tokenize.Token {
38+
var tokens []tokenize.Token
39+
var clean []string
40+
var tag string
41+
var found bool
42+
43+
p1, p2 := "-START-", "-START2-"
44+
context := []string{p1, p2}
45+
for _, w := range words {
46+
if w == "" {
47+
continue
48+
}
49+
context = append(context, normalize(w))
50+
clean = append(clean, w)
51+
}
52+
context = append(context, []string{"-END-", "-END2-"}...)
53+
for i, word := range clean {
54+
if none.MatchString(word) {
55+
tag = "-NONE-"
56+
} else if keep.MatchString(word) {
57+
tag = word
58+
} else if tag, found = pt.Model.TagMap[word]; !found {
59+
tag = pt.Model.Predict(featurize(i, word, context, p1, p2))
60+
}
61+
tokens = append(tokens, tokenize.Token{Tag: tag, Text: word})
62+
p2 = p1
63+
p1 = tag
64+
}
65+
66+
return tokens
67+
}
68+
69+
// TokenizeAndTag ...
70+
func (pt PerceptronTagger) TokenizeAndTag(corpus string) []tokenize.Token {
71+
var tokens []tokenize.Token
72+
for _, s := range pt.STokenizer.Tokenize(corpus) {
73+
tokens = append(tokens, pt.Tag(pt.WTokenizer(s.Text))...)
74+
}
75+
return tokens
76+
}
77+
78+
func featurize(i int, w string, ctx []string, p1 string, p2 string) map[string]float64 {
79+
feats := make(map[string]float64)
80+
suf := min(len(w), 3)
81+
i = min(len(ctx)-2, i+2)
82+
iminus := min(len(ctx[i-1]), 3)
83+
iplus := min(len(ctx[i+1]), 3)
84+
feats = add([]string{"bias"}, feats)
85+
feats = add([]string{"i suffix", w[len(w)-suf:]}, feats)
86+
feats = add([]string{"i pref1", string(w[0])}, feats)
87+
feats = add([]string{"i-1 tag", p1}, feats)
88+
feats = add([]string{"i-2 tag", p2}, feats)
89+
feats = add([]string{"i tag+i-2 tag", p1, p2}, feats)
90+
feats = add([]string{"i word", ctx[i]}, feats)
91+
feats = add([]string{"i-1 tag+i word", p1, ctx[i]}, feats)
92+
feats = add([]string{"i-1 word", ctx[i-1]}, feats)
93+
feats = add([]string{"i-1 suffix", ctx[i-1][len(ctx[i-1])-iminus:]}, feats)
94+
feats = add([]string{"i-2 word", ctx[i-2]}, feats)
95+
feats = add([]string{"i+1 word", ctx[i+1]}, feats)
96+
feats = add([]string{"i+1 suffix", ctx[i+1][len(ctx[i+1])-iplus:]}, feats)
97+
feats = add([]string{"i+2 word", ctx[i+2]}, feats)
98+
return feats
99+
}
100+
101+
// add joins args with single spaces to form a feature key and increments that
// key's count in features, returning the map.
//
// A missing key reads as zero, so a plain increment covers both the first and
// later occurrences. A nil features map is replaced by a fresh one instead of
// panicking on write; callers must therefore use the returned map.
func add(args []string, features map[string]float64) map[string]float64 {
	if features == nil {
		features = make(map[string]float64)
	}
	features[strings.Join(args, " ")]++
	return features
}
110+
111+
// normalize maps a raw word onto the form used in the tagging context:
//
//   - "" is returned unchanged;
//   - a word containing "-" that does not start with "-" becomes "!HYPHEN";
//   - a word of exactly four decimal digits becomes "!YEAR";
//   - any other word starting with a decimal digit becomes "!DIGITS";
//   - everything else is lower-cased.
//
// Signed numbers such as "-123" or "+123" are not years: strconv.Atoi accepts
// a leading sign, so the year check additionally requires a leading digit.
func normalize(word string) string {
	if word == "" {
		return word
	}

	first := word[0]
	if first != '-' && strings.Contains(word, "-") {
		return "!HYPHEN"
	}

	if first >= '0' && first <= '9' {
		// With a leading digit and length four, Atoi succeeds only when all
		// four characters are digits.
		if _, err := strconv.Atoi(word); err == nil && len(word) == 4 {
			return "!YEAR"
		}
		return "!DIGITS"
	}

	return strings.ToLower(word)
}

cmd/aptag/main.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
package main
2+
3+
import (
4+
"bufio"
5+
"encoding/json"
6+
"fmt"
7+
"io/ioutil"
8+
"os"
9+
"strings"
10+
11+
"github.com/jdkato/aptag"
12+
"github.com/jdkato/aptag/tokenize"
13+
"github.com/urfave/cli"
14+
)
15+
16+
// Version is the semantic version number
17+
var Version string
18+
19+
func main() {
20+
var file string
21+
var text []byte
22+
var tags []tokenize.Token
23+
var err error
24+
var isTokenized bool
25+
26+
app := cli.NewApp()
27+
app.Name = "aptag"
28+
app.Usage = "A command-line POS tagger."
29+
app.Version = Version
30+
app.Flags = []cli.Flag{
31+
cli.StringFlag{
32+
Name: "path",
33+
Usage: "read `path` as source input instead of stdin",
34+
Destination: &file,
35+
},
36+
cli.BoolFlag{
37+
Name: "tokens",
38+
Usage: "assume input has already been tokenized",
39+
Destination: &isTokenized,
40+
},
41+
}
42+
43+
app.Action = func(c *cli.Context) error {
44+
if file != "" {
45+
text, err = ioutil.ReadFile(file)
46+
if err != nil {
47+
panic(err)
48+
}
49+
} else {
50+
stat, serr := os.Stdin.Stat()
51+
if serr != nil {
52+
panic(err)
53+
} else if (stat.Mode() & os.ModeCharDevice) == 0 {
54+
reader := bufio.NewReader(os.Stdin)
55+
text, err = ioutil.ReadAll(reader)
56+
if err != nil {
57+
panic(err)
58+
}
59+
}
60+
}
61+
if len(text) > 0 {
62+
tagger := aptag.NewPerceptronTagger()
63+
if isTokenized {
64+
tags = tagger.Tag(strings.Split(string(text), " "))
65+
} else {
66+
tags = tagger.TokenizeAndTag(string(text))
67+
}
68+
b, jerr := json.Marshal(tags)
69+
if jerr != nil {
70+
return jerr
71+
}
72+
fmt.Println(string(b))
73+
}
74+
return err
75+
}
76+
77+
if app.Run(os.Args) != nil {
78+
os.Exit(1)
79+
} else {
80+
os.Exit(0)
81+
}
82+
}

model/classes.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
["EX", "NNPS", "WP$", "TO", ":", "FW", "JJR", "$", "VBN", "CC", "#", "NNP", "VBZ", "VBD", "POS", "RB", "CD", "NN", "JJ", "RBS", "MD", "VB", "``", ".", "RP", ")", ",", "(", "WRB", "RBR", "IN", "PRP$", "SYM", "DT", "VBP", "PDT", "LS", "JJS", "WDT", "UH", "VBG", "''", "NNS", "WP", "PRP"]

0 commit comments

Comments
 (0)