Skip to content
This repository was archived by the owner on May 14, 2023. It is now read-only.

Commit 238c333

Browse files
committed
refactor: update structure
1 parent 37b6dcb commit 238c333

File tree

13 files changed

+1239
-103
lines changed

13 files changed

+1239
-103
lines changed

.travis.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
language: go
2+
go:
3+
- 1.7
4+
- 1.8
5+
matrix:
6+
include:
7+
- os: linux
8+
env: ARCH="i686"
9+
- os: linux
10+
env: ARCH="x86_64"
11+
- os: osx
12+
env: ARCH="x86_64"
13+
osx_image: xcode8
14+
install:
15+
- make setup
16+
- make build
17+
script:
18+
- make test

Makefile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,12 @@ build:
1717
build-win:
1818
go build ${LDFLAGS} -o bin/prose.exe ./cmd/prose
1919

20-
test:
20+
test-tokenize:
21+
python3 scripts/treebank_words.py
2122
go test -v ./tokenize
23+
24+
test: test-tokenize
25+
2226
ci: test lint
2327

2428
lint:
@@ -35,7 +39,7 @@ lint:
3539
--enable=vet \
3640
--enable=vetshadow \
3741
--deadline=1m \
38-
./...
42+
./tokenize ./tag
3943

4044
setup:
4145
go get -u github.com/jteeuwen/go-bindata/...

internal/util/util.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,31 @@
11
package util
22

3-
import "github.com/jdkato/prose/internal/model"
3+
import (
4+
"io/ioutil"
5+
"path/filepath"
46

7+
"github.com/jdkato/prose/internal/model"
8+
)
9+
10+
// ReadDataFile reads data from a file, panicking on any errors.
11+
func ReadDataFile(path string) []byte {
12+
p, err := filepath.Abs(path)
13+
CheckError(err)
14+
15+
data, ferr := ioutil.ReadFile(p)
16+
CheckError(ferr)
17+
18+
return data
19+
}
20+
21+
// CheckError panics if `err` is not `nil`.
522
func CheckError(err error) {
623
if err != nil {
724
panic(err)
825
}
926
}
1027

28+
// Min returns the minimum of `a` and `b`.
1129
func Min(a, b int) int {
1230
if a < b {
1331
return a
@@ -45,6 +63,7 @@ func IsAlnum(c byte) bool {
4563
return (c >= '0' && c <= '9') || IsLetter(c)
4664
}
4765

66+
// GetAsset returns the named Asset.
4867
func GetAsset(name string) []byte {
4968
b, err := model.Asset("model/" + name)
5069
CheckError(err)

scripts/treebank_words.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import json
2+
import pathlib
3+
4+
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
5+
6+
if __name__ == '__main__':
7+
t = TreebankWordTokenizer()
8+
with open(pathlib.PurePath('testdata', 'tokenize.json')) as d:
9+
data = json.load(d)
10+
11+
words = []
12+
sents = []
13+
for text in data:
14+
for s in sent_tokenize(text):
15+
sents.append(s)
16+
words.append(t.tokenize(s))
17+
18+
with open(pathlib.PurePath('testdata', 'treebank_words.json'), 'w') as f:
19+
json.dump(words, f, indent=4)
20+
21+
with open(pathlib.PurePath('testdata', 'treebank_sents.json'), 'w') as f:
22+
json.dump(sents, f, indent=4)

testdata/tokenize.json

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
[
2+
"They'll save and invest more.",
3+
"How's it going?",
4+
"abbreviations like M.D. and initials containing periods, they",
5+
"hi, my name can't hello,",
6+
"Hello World. My name is Jonas.",
7+
"There it is! I found it.",
8+
"My name is Jonas E. Smith.",
9+
"At eight o'clock on Thursday morning ... Arthur didn't feel very good.",
10+
"Please turn to p. 55.",
11+
"Were Jane and co. at the party?",
12+
"They closed the deal with Pitt, Briggs & Co. at noon.",
13+
"Let's ask Jane and co. They should know.",
14+
"They closed the deal with Pitt, Briggs & Co. It closed yesterday.",
15+
"I can see Mt. Fuji from here.",
16+
"St. Michael's Church is on 5th st. near the light.",
17+
"That is JFK Jr.'s book.",
18+
"I visited the U.S.A. last year.",
19+
"I live in the E.U. How about you?",
20+
"I live in the U.S. How about you?",
21+
"I work for the U.S. Government in Virginia.",
22+
"I have lived in the U.S. for 20 years.",
23+
"At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.",
24+
"She has $100.00 in her bag.",
25+
"She has $100.00. It is in her bag.",
26+
"He teaches science (He previously worked for 5 years as an engineer.) at the local University.",
27+
"Her email is Jane.Doe@example.com. I sent her an email.",
28+
"The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.",
29+
"She turned to him, 'This is great.' she said.",
30+
"She turned to him, \"This is great.\" she said.",
31+
"She turned to him, \"This is great.\" She held the book out to show him.",
32+
"Hello!! Long time no see.",
33+
"Hello?? Who is there?",
34+
"Hello!? Is that you?",
35+
"Hello?! Is that you?",
36+
"1.) The first item 2.) The second item",
37+
"1.) The first item. 2.) The second item.",
38+
"1) The first item 2) The second item",
39+
"1) The first item. 2) The second item.",
40+
"1. The first item 2. The second item",
41+
"1. The first item. 2. The second item.",
42+
"• 9. The first item • 10. The second item",
43+
"⁃9. The first item ⁃10. The second item",
44+
"a. The first item b. The second item c. The third list item",
45+
"This is a sentence\ncut off in the middle because pdf.",
46+
"It was a cold \nnight in the city.",
47+
"features\ncontact manager\nevents, activities\n",
48+
"You can find it at N°. 1026.253.553. That is where the treasure is.",
49+
"She works at Yahoo! in the accounting department.",
50+
"We make a good team, you and I. Did you see Albert I. Jones yesterday?",
51+
"Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”",
52+
"\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).",
53+
"If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.",
54+
"I never meant that.... She left the store.",
55+
"I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.",
56+
"One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .",
57+
"Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot."
58+
]

testdata/treebank_sents.json

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
[
2+
"They'll save and invest more.",
3+
"How's it going?",
4+
"abbreviations like M.D.",
5+
"and initials containing periods, they",
6+
"hi, my name can't hello,",
7+
"Hello World.",
8+
"My name is Jonas.",
9+
"There it is!",
10+
"I found it.",
11+
"My name is Jonas E. Smith.",
12+
"At eight o'clock on Thursday morning ... Arthur didn't feel very good.",
13+
"Please turn to p. 55.",
14+
"Were Jane and co. at the party?",
15+
"They closed the deal with Pitt, Briggs & Co. at noon.",
16+
"Let's ask Jane and co.",
17+
"They should know.",
18+
"They closed the deal with Pitt, Briggs & Co.",
19+
"It closed yesterday.",
20+
"I can see Mt.",
21+
"Fuji from here.",
22+
"St. Michael's Church is on 5th st. near the light.",
23+
"That is JFK Jr.'s book.",
24+
"I visited the U.S.A. last year.",
25+
"I live in the E.U.",
26+
"How about you?",
27+
"I live in the U.S. How about you?",
28+
"I work for the U.S. Government in Virginia.",
29+
"I have lived in the U.S. for 20 years.",
30+
"At 5 a.m. Mr. Smith went to the bank.",
31+
"He left the bank at 6 P.M. Mr. Smith then went to the store.",
32+
"She has $100.00 in her bag.",
33+
"She has $100.00.",
34+
"It is in her bag.",
35+
"He teaches science (He previously worked for 5 years as an engineer.)",
36+
"at the local University.",
37+
"Her email is Jane.Doe@example.com.",
38+
"I sent her an email.",
39+
"The site is: https://www.example.50.com/new-site/awesome_content.html.",
40+
"Please check it out.",
41+
"She turned to him, 'This is great.'",
42+
"she said.",
43+
"She turned to him, \"This is great.\"",
44+
"she said.",
45+
"She turned to him, \"This is great.\"",
46+
"She held the book out to show him.",
47+
"Hello!!",
48+
"Long time no see.",
49+
"Hello??",
50+
"Who is there?",
51+
"Hello!?",
52+
"Is that you?",
53+
"Hello?!",
54+
"Is that you?",
55+
"1.)",
56+
"The first item 2.)",
57+
"The second item",
58+
"1.)",
59+
"The first item.",
60+
"2.)",
61+
"The second item.",
62+
"1) The first item 2) The second item",
63+
"1) The first item.",
64+
"2) The second item.",
65+
"1.",
66+
"The first item 2.",
67+
"The second item",
68+
"1.",
69+
"The first item.",
70+
"2.",
71+
"The second item.",
72+
"\u2022 9.",
73+
"The first item \u2022 10.",
74+
"The second item",
75+
"\u20439.",
76+
"The first item \u204310.",
77+
"The second item",
78+
"a.",
79+
"The first item b.",
80+
"The second item c. The third list item",
81+
"This is a sentence\ncut off in the middle because pdf.",
82+
"It was a cold \nnight in the city.",
83+
"features\ncontact manager\nevents, activities",
84+
"You can find it at N\u00b0.",
85+
"1026.253.553.",
86+
"That is where the treasure is.",
87+
"She works at Yahoo!",
88+
"in the accounting department.",
89+
"We make a good team, you and I.",
90+
"Did you see Albert I. Jones yesterday?",
91+
"Thoreau argues that by simplifying one\u2019s life, \u201cthe laws of the universe will appear less complex.",
92+
".",
93+
".",
94+
".\u201d",
95+
"\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).",
96+
"If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period .",
97+
".",
98+
".",
99+
".",
100+
"Next sentence.",
101+
"I never meant that.... She left the store.",
102+
"I wasn\u2019t really ... well, what I mean...see .",
103+
".",
104+
".",
105+
"what I'm saying, the thing is .",
106+
".",
107+
".",
108+
"I didn\u2019t mean it.",
109+
"One further habit which was somewhat weakened .",
110+
".",
111+
".",
112+
"was that of combining words into self-interpreting compounds.",
113+
".",
114+
".",
115+
".",
116+
"The practice was not abandoned.",
117+
".",
118+
".",
119+
".",
120+
"Hello world.Today is Tuesday.Mr.",
121+
"Smith went to the store and bought 1,000.That is a lot."
122+
]

0 commit comments

Comments
 (0)