jdkato
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 58 additions & 0 deletions b/‎Makefile‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 62 additions & 2 deletions b/‎README.md‎
Lines changed: 62 additions & 2 deletions
diff --git a/‎VERSION‎
Lines changed: 1 addition & 0 deletions b/‎VERSION‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎aptag.go‎
Lines changed: 124 additions & 0 deletions b/‎aptag.go‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎cmd/aptag/main.go‎
Lines changed: 82 additions & 0 deletions b/‎cmd/aptag/main.go‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎model/classes.json‎
Lines changed: 1 addition & 0 deletions b/‎model/classes.json‎
Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,5 @@
+bin
+
 # Compiled Object files, Static and Dynamic libs (Shared Objects)
 *.o
 *.a
 
@@ -0,0 +1,58 @@
+# Location of this package inside GOPATH; the VERSION file is read from there.
+BASE_DIR=$(shell echo $$GOPATH)/src/github.com/jdkato/aptag
+BUILD_DIR=./builds
+COMMIT= `git rev-parse --short HEAD 2>/dev/null`
+
+VERSION_FILE=$(BASE_DIR)/VERSION
+VERSION=$(shell cat $(VERSION_FILE))
+
+# Strip symbol/debug tables and stamp main.Version with the contents of VERSION.
+LDFLAGS=-ldflags "-s -w -X main.Version=$(VERSION)"
+
+.PHONY: all build build-win clean test lint ci cross install bump model setup
+
+# Default target: regenerate the embedded model, then build the CLI.
+all: model build
+
+build:
+	go build ${LDFLAGS} -o bin/aptag ./cmd/aptag
+
+build-win:
+	go build ${LDFLAGS} -o bin/aptag.exe ./cmd/aptag
+
+test:
+	go test -v ./...
+
+ci: test lint
+
+lint:
+	gometalinter --vendor --disable-all \
+		--enable=deadcode \
+		--enable=ineffassign \
+		--enable=gosimple \
+		--enable=staticcheck \
+		--enable=gofmt \
+		--enable=goimports \
+		--enable=dupl \
+		--enable=misspell \
+		--enable=errcheck \
+		--enable=vet \
+		--enable=vetshadow \
+		--deadline=1m \
+		./...
+
+# Install the development dependencies and generate model/model.go.
+setup:
+	go get -u github.com/alecthomas/gometalinter
+	go get -u github.com/stretchr/testify/assert
+	go get -u github.com/urfave/cli
+	go get -u github.com/jteeuwen/go-bindata/...
+	go-bindata -ignore=\\.DS_Store -pkg="model" -o model/model.go model/
+	gometalinter --install
+
+# Write the next version to VERSION_FILE. By default the patch component of
+# the current VERSION is incremented; pass VER=x.y.z on the command line to
+# choose the value explicitly.
+#
+# These are target-specific variables: make-level assignments such as `?=`
+# cannot live inside the recipe, because every recipe line is passed to the
+# shell.
+bump: MAJOR=$(word 1, $(subst ., , $(VERSION)))
+bump: MINOR=$(word 2, $(subst ., , $(VERSION)))
+bump: PATCH=$(word 3, $(subst ., , $(VERSION)))
+bump: VER?=$(MAJOR).$(MINOR).$(shell echo $$(($(PATCH)+1)))
+bump:
+	echo $(VER) > $(VERSION_FILE)
+
+# Embed the files under model/ into model/model.go.
+model:
+	go-bindata -ignore=\\.DS_Store -pkg="model" -o model/model.go model/
@@ -1,2 +1,62 @@
-# tag
-An English-language Part-of-Speech Tagger
+# aptag
+
+An English-language Part-of-Speech Tagger:
+
+```go
+import (
+    "fmt"
+
+    "github.com/jdkato/aptag"
+)
+
+func main() {
+    text := "Dive into NLTK: Part-of-speech tagging and POS Tagger."
+    tagger := aptag.NewPerceptronTagger()
+
+    tokens := tagger.TokenizeAndTag(text)
+    for _, tok := range tokens {
+        fmt.Println(tok.Text, tok.Tag)
+    }
+}
+```
+
+## Install
+
+```
+go get github.com/jdkato/aptag
+```
+
+## Performance
+
+| Library | Accuracy | Time (sec) |
+|:--------|---------:|-----------:|
+| NLTK    |    0.893 |      6.755 |
+| aptag   |    0.961 |      2.879 |
+
+(see `scripts/test_model.py`.)
+
+## Notice
+
+This is a port of [`textblob-aptagger`](https://github.com/sloria/textblob-aptagger):
+
+```
+Copyright 2013 Matthew Honnibal
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+```
@@ -0,0 +1 @@
+0.1.0
@@ -0,0 +1,124 @@
+package aptag
+
+import (
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/jdkato/aptag/tokenize"
+	"gopkg.in/neurosnap/sentences.v1"
+	"gopkg.in/neurosnap/sentences.v1/english"
+)
+
+// none matches words that Tag labels "-NONE-" without consulting the model:
+// "0", "*", and the "*"-prefixed forms such as "*U*", "*-1" or "*T*-2".
+var none = regexp.MustCompile(`^(?:0|\*[\w?]\*|\*\-\d{1,3}|\*[A-Z]+\*\-\d{1,3}|\*)$`)
+// keep matches a word made of exactly three uppercase letters between hyphens
+// (e.g. "-LRB-"); Tag uses such a word verbatim as its own tag.
+var keep = regexp.MustCompile(`^\-[A-Z]{3}\-$`)
+
+// PerceptronTagger bundles the tagging model with the sentence and word
+// tokenizers that TokenizeAndTag uses to split raw text before tagging.
+type PerceptronTagger struct {
+	// Model provides the TagMap lookup and the Predict call used by Tag.
+	Model      *AveragedPerceptron
+	// STokenizer splits a corpus into sentences.
+	STokenizer *sentences.DefaultSentenceTokenizer
+	// WTokenizer splits the text of a single sentence into words.
+	WTokenizer tokenize.WordTokenizer
+}
+
+// NewPerceptronTagger returns a tagger wired with a fresh averaged-perceptron
+// model, the English sentence tokenizer and the package's default word
+// tokenizer. A failure to build the sentence tokenizer is handed to checkError.
+func NewPerceptronTagger() *PerceptronTagger {
+	model := NewAveragedPerceptron()
+	sentTok, err := english.NewSentenceTokenizer(nil)
+	checkError(err)
+
+	return &PerceptronTagger{
+		Model:      model,
+		STokenizer: sentTok,
+		WTokenizer: tokenize.WordTokenizerFn,
+	}
+}
+
+// Tag assigns a part-of-speech tag to every non-empty entry of words and
+// returns one Token per tagged word, in input order.
+//
+// A word matching the none pattern is tagged "-NONE-", a word matching the
+// keep pattern is tagged with itself, a word present in Model.TagMap takes the
+// mapped tag, and anything else is predicted from its features. The result is
+// nil when no non-empty word was supplied.
+func (pt PerceptronTagger) Tag(words []string) []tokenize.Token {
+	var (
+		tagged []tokenize.Token
+		kept   []string
+	)
+
+	// The context is the normalized sentence padded with two start and two
+	// end markers, so featurize can always look two words either side.
+	prev, prev2 := "-START-", "-START2-"
+	ctx := []string{prev, prev2}
+	for _, raw := range words {
+		if raw != "" {
+			ctx = append(ctx, normalize(raw))
+			kept = append(kept, raw)
+		}
+	}
+	ctx = append(ctx, "-END-", "-END2-")
+
+	for idx, word := range kept {
+		var label string
+		switch {
+		case none.MatchString(word):
+			label = "-NONE-"
+		case keep.MatchString(word):
+			label = word
+		default:
+			known, ok := pt.Model.TagMap[word]
+			if ok {
+				label = known
+			} else {
+				label = pt.Model.Predict(featurize(idx, word, ctx, prev, prev2))
+			}
+		}
+		tagged = append(tagged, tokenize.Token{Tag: label, Text: word})
+		// Slide the tag history: the tag just assigned becomes "previous".
+		prev2, prev = prev, label
+	}
+
+	return tagged
+}
+
+// TokenizeAndTag splits corpus into sentences, splits each sentence into
+// words, tags each sentence independently with Tag and returns the tokens of
+// all sentences concatenated in order.
+func (pt PerceptronTagger) TokenizeAndTag(corpus string) []tokenize.Token {
+	var all []tokenize.Token
+	for _, sent := range pt.STokenizer.Tokenize(corpus) {
+		words := pt.WTokenizer(sent.Text)
+		tagged := pt.Tag(words)
+		all = append(all, tagged...)
+	}
+	return all
+}
+
+// featurize builds the feature map for the word w at position i of a sentence.
+//
+// ctx is the padded context as built by Tag: two start markers, the
+// normalized words, then two end markers. i is the index of w among the
+// words, so it is shifted by two to address ctx. p1 and p2 are the tags
+// assigned to the previous word and the one before it.
+//
+// The returned map has one entry per feature; each key is the feature name
+// joined with its values by single spaces (see add).
+//
+// Prefixes and suffixes are taken by character (rune) rather than by byte, so
+// a word containing multi-byte characters never yields a suffix that starts
+// in the middle of a character or a mangled first letter.
+func featurize(i int, w string, ctx []string, p1 string, p2 string) map[string]float64 {
+	// tail returns the last three characters of s, or all of s if shorter.
+	tail := func(s string) string {
+		r := []rune(s)
+		return string(r[len(r)-min(len(r), 3):])
+	}
+
+	// First character of the word; empty for an empty word instead of an
+	// out-of-range panic.
+	head := ""
+	if r := []rune(w); len(r) > 0 {
+		head = string(r[0])
+	}
+
+	// Skip the two start markers; the clamp keeps ctx[i+1] in range.
+	i = min(len(ctx)-2, i+2)
+
+	feats := make(map[string]float64, 14)
+	add([]string{"bias"}, feats)
+	add([]string{"i suffix", tail(w)}, feats)
+	add([]string{"i pref1", head}, feats)
+	add([]string{"i-1 tag", p1}, feats)
+	add([]string{"i-2 tag", p2}, feats)
+	add([]string{"i tag+i-2 tag", p1, p2}, feats)
+	add([]string{"i word", ctx[i]}, feats)
+	add([]string{"i-1 tag+i word", p1, ctx[i]}, feats)
+	add([]string{"i-1 word", ctx[i-1]}, feats)
+	add([]string{"i-1 suffix", tail(ctx[i-1])}, feats)
+	add([]string{"i-2 word", ctx[i-2]}, feats)
+	add([]string{"i+1 word", ctx[i+1]}, feats)
+	add([]string{"i+1 suffix", tail(ctx[i+1])}, feats)
+	add([]string{"i+2 word", ctx[i+2]}, feats)
+	return feats
+}
+
+// add joins args with single spaces to form a feature key and increments that
+// key's count in features (a key seen for the first time ends up at 1). The
+// map is modified in place and also returned.
+func add(args []string, features map[string]float64) map[string]float64 {
+	// Indexing a missing key yields 0, so one increment covers both cases.
+	features[strings.Join(args, " ")]++
+	return features
+}
+
+// normalize maps a word to the form used as context for feature extraction:
+//
+//   - "" is returned unchanged;
+//   - a word containing a hyphen that does not start with one -> "!HYPHEN";
+//   - a word of exactly four ASCII digits -> "!YEAR";
+//   - any other word starting with an ASCII digit -> "!DIGITS";
+//   - everything else is lower-cased.
+func normalize(word string) string {
+	if word == "" {
+		return word
+	}
+	first := word[0]
+	startsWithDigit := '0' <= first && first <= '9'
+
+	if first != '-' && strings.Contains(word, "-") {
+		return "!HYPHEN"
+	}
+	// strconv.Atoi also accepts a leading sign, so require a leading digit:
+	// signed values such as "-123" or "+123" are not years.
+	if startsWithDigit && len(word) == 4 {
+		if _, err := strconv.Atoi(word); err == nil {
+			return "!YEAR"
+		}
+	}
+	if startsWithDigit {
+		return "!DIGITS"
+	}
+	return strings.ToLower(word)
+}
@@ -0,0 +1,82 @@
+package main
+
+import (
+	"bufio"
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"strings"
+
+	"github.com/jdkato/aptag"
+	"github.com/jdkato/aptag/tokenize"
+	"github.com/urfave/cli"
+)
+
+// Version is the semantic version number. It is empty unless set at link
+// time; the project Makefile injects it with -ldflags "-X main.Version=...".
+var Version string
+
+// main runs the aptag command-line tool.
+//
+// Input is read from the file named by --path or, when that flag is absent,
+// from stdin if stdin is not a terminal. With --tokens the input is treated
+// as whitespace-separated, pre-tokenized words; otherwise it is split into
+// sentences and words first. The tagged tokens are printed to stdout as a
+// JSON array. Empty input produces no output.
+//
+// Any error is printed to stderr and the process exits with status 1.
+func main() {
+	var file string
+	var isTokenized bool
+
+	app := cli.NewApp()
+	app.Name = "aptag"
+	app.Usage = "A command-line POS tagger."
+	app.Version = Version
+	app.Flags = []cli.Flag{
+		cli.StringFlag{
+			Name:        "path",
+			Usage:       "read `path` as source input instead of stdin",
+			Destination: &file,
+		},
+		cli.BoolFlag{
+			Name:        "tokens",
+			Usage:       "assume input has already been tokenized",
+			Destination: &isTokenized,
+		},
+	}
+
+	app.Action = func(c *cli.Context) error {
+		var text []byte
+		var err error
+
+		if file != "" {
+			if text, err = ioutil.ReadFile(file); err != nil {
+				return err
+			}
+		} else {
+			stat, serr := os.Stdin.Stat()
+			if serr != nil {
+				return serr
+			}
+			// Only consume stdin when it is piped or redirected; reading
+			// from an interactive terminal would block waiting for input.
+			if stat.Mode()&os.ModeCharDevice == 0 {
+				if text, err = ioutil.ReadAll(bufio.NewReader(os.Stdin)); err != nil {
+					return err
+				}
+			}
+		}
+		if len(text) == 0 {
+			return nil
+		}
+
+		var tags []tokenize.Token
+		tagger := aptag.NewPerceptronTagger()
+		if isTokenized {
+			// Split on any whitespace so newlines (including the trailing
+			// one of piped input) do not end up inside a token.
+			tags = tagger.Tag(strings.Fields(string(text)))
+		} else {
+			tags = tagger.TokenizeAndTag(string(text))
+		}
+
+		b, err := json.Marshal(tags)
+		if err != nil {
+			return err
+		}
+		fmt.Println(string(b))
+		return nil
+	}
+
+	if err := app.Run(os.Args); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
@@ -0,0 +1 @@
+["EX", "NNPS", "WP$", "TO", ":", "FW", "JJR", "$", "VBN", "CC", "#", "NNP", "VBZ", "VBD", "POS", "RB", "CD", "NN", "JJ", "RBS", "MD", "VB", "``", ".", "RP", ")", ",", "(", "WRB", "RBR", "IN", "PRP$", "SYM", "DT", "VBP", "PDT", "LS", "JJS", "WDT", "UH", "VBG", "''", "NNS", "WP", "PRP"]
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+bin`
	`2`	`+`
`1`	`3`	`# Compiled Object files, Static and Dynamic libs (Shared Objects)`
`2`	`4`	`*.o`
`3`	`5`	`*.a`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+["EX", "NNPS", "WP$", "TO", ":", "FW", "JJR", "$", "VBN", "CC", "#", "NNP", "VBZ", "VBD", "POS", "RB", "CD", "NN", "JJ", "RBS", "MD", "VB", "``", ".", "RP", ")", ",", "(", "WRB", "RBR", "IN", "PRP$", "SYM", "DT", "VBP", "PDT", "LS", "JJS", "WDT", "UH", "VBG", "''", "NNS", "WP", "PRP"]