Use stanza's sentence segmentation, add support for newer stanza versions and drop support for Python <3.8 #103

Open · wants to merge 3 commits into master
6 changes: 3 additions & 3 deletions .github/workflows/tests.yml
@@ -21,16 +21,16 @@ jobs:
fail-fast: true
matrix:
os: [ubuntu-latest]
python_version: ["3.7", "3.11"]
python_version: ["3.8", "3.11"]

runs-on: ${{ matrix.os }}

steps:
- name: Check out repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4

- name: Configure Python version
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
architecture: x64
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
spacy>=3.0.0,<4.0.0
- stanza>=1.2.0,<1.7.0
+ stanza>=1.2.0, <2.0.0
# Development dependencies
pytest>=5.2.0
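The widened upper bound accepts any stanza 1.x release. A minimal, hedged sketch of a runtime sanity check against that range (the `packaging` dependency and the check itself are illustrative assumptions, not part of this PR):

```python
# Hypothetical sanity check, not part of this PR: confirm the installed
# stanza falls inside the widened bound declared in requirements.txt.
import stanza
from packaging.version import Version

installed = Version(stanza.__version__)
assert Version("1.2.0") <= installed < Version("2.0.0"), (
    f"stanza {installed} is outside the supported range >=1.2.0,<2.0.0"
)
```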
6 changes: 2 additions & 4 deletions setup.py
@@ -32,8 +32,8 @@ def setup_package():
version=about["__version__"],
license=about["__license__"],
packages=find_packages(),
install_requires=["spacy>=3.0.0,<4.0.0", "stanza>=1.2.0,<1.7.0"],
python_requires=">=3.6",
install_requires=["spacy>=3.0.0,<4.0.0", "stanza>=1.2.0, <2.0.0"],
python_requires=">=3.8",
entry_points={
"spacy_tokenizers": [
"spacy_stanza.PipelineAsTokenizer.v1 = spacy_stanza:tokenizer.create_tokenizer",
@@ -43,8 +43,6 @@ def setup_package():
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
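The `spacy_tokenizers` entry point kept above registers the Stanza pipeline as a drop-in spaCy tokenizer factory. A hedged usage sketch under the new Python and stanza requirements (the language code and sample text are assumptions; the English stanza model must already be downloaded with `stanza.download("en")`):

```python
import spacy

# Point spaCy's config at the registered tokenizer factory.
config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy_stanza.PipelineAsTokenizer.v1",
            "lang": "en",
        }
    }
}
nlp = spacy.blank("en", config=config)
doc = nlp("This is a sentence. Here is another one.")
print([token.text for token in doc])
```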
4 changes: 3 additions & 1 deletion spacy_stanza/__init__.py
@@ -10,7 +10,7 @@ def load_pipeline(
lang: str = "",
dir: Optional[str] = None,
package: str = "default",
- processors: Union[dict, str] = {},
+ processors: Union[dict, str] = None,
logging_level: Optional[Union[int, str]] = None,
verbose: Optional[bool] = None,
use_gpu: bool = True,
@@ -32,6 +32,8 @@ def load_pipeline(
**kwargs: Options for the individual stanza processors.
RETURNS (Language): The nlp object.
"""
+ if processors is None:
+ processors = {}
# Create an empty config skeleton
config = {"nlp": {"tokenizer": {"kwargs": {}}}}
if lang == "":
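For context on why the `processors={}` default becomes `None`: a mutable default argument is created once at function definition time and shared across every call. A minimal generic sketch of the pitfall and of the sentinel pattern adopted above (illustrative code, not spacy-stanza's):

```python
def broken(processors={}):
    # The same dict object is reused on every call.
    processors.setdefault("tokenize", "default")
    return processors

assert broken() is broken()  # both calls hand back the shared dict

def fixed(processors=None):
    # A None sentinel gives each call its own fresh dict.
    if processors is None:
        processors = {}
    processors.setdefault("tokenize", "default")
    return processors

assert fixed() is not fixed()  # separate dicts per call
```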
49 changes: 40 additions & 9 deletions spacy_stanza/tokenizer.py
@@ -15,12 +15,17 @@ def create_tokenizer(
lang: str = "",
dir: Optional[str] = None,
package: str = "default",
- processors: Union[dict, str] = {},
+ processors: Union[dict, str] = None,
logging_level: Optional[Union[int, str]] = None,
verbose: Optional[bool] = None,
use_gpu: bool = True,
- kwargs: dict = {},
+ kwargs: dict = None,
):
+ if processors is None:
+ processors = {}
+ if kwargs is None:
+ kwargs = {}

def tokenizer_factory(
nlp,
lang=lang,
@@ -82,16 +87,14 @@ def __call__(self, text):

snlp_doc = self.snlp(text)
text = snlp_doc.text
- snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
- words = []
- spaces = []
+ snlp_tokens, snlp_heads, snlp_sent_starts = self.get_tokens_with_heads(snlp_doc)
pos = []
tags = []
morphs = []
deps = []
heads = []
lemmas = []
offset = 0
+ sent_starts = []
token_texts = [t.text for t in snlp_tokens]
is_aligned = True
try:
@@ -117,6 +120,7 @@ def __call__(self, text):
morphs.append("")
deps.append("")
lemmas.append(word)
+ sent_starts.append(False)

# increment any heads left of this position that point beyond
# this position to the right (already present in heads)
@@ -141,7 +145,7 @@
else:
token = snlp_tokens[i + offset]
assert word == token.text

+ sent_starts.append(snlp_sent_starts[i + offset])
pos.append(token.upos or "")
tags.append(token.xpos or token.upos or "")
morphs.append(token.feats or "")
@@ -158,6 +162,7 @@
morphs=morphs,
lemmas=lemmas,
deps=deps,
+ sent_starts=sent_starts,
heads=[head + i for i, head in enumerate(heads)],
)
ents = []
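The `sent_starts` flags gathered above are passed straight to spaCy's `Doc` constructor, which is what surfaces stanza's sentence segmentation as `doc.sents`. A minimal standalone sketch of that mechanism (the words, spaces, and flags are made up for illustration):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["Hello", "world", ".", "Goodbye", "world", "."]
spaces = [True, False, True, True, False, False]
sent_starts = [True, False, False, True, False, False]

# sent_starts sets token.is_sent_start directly, so doc.sents can be
# iterated without a sentencizer or dependency parse.
doc = Doc(Vocab(), words=words, spaces=spaces, sent_starts=sent_starts)
print([sent.text for sent in doc.sents])  # ['Hello world.', 'Goodbye world.']
```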
@@ -199,8 +204,11 @@ def get_tokens_with_heads(self, snlp_doc):
"""
tokens = []
heads = []
+ sent_starts = []
offset = 0
+ token_idx = 0
for sentence in snlp_doc.sentences:
+ first = True
for token in sentence.tokens:
for word in token.words:
# Here, we're calculating the absolute token index in the doc,
@@ -212,8 +220,31 @@
head = 0
heads.append(head)
tokens.append(word)
+ if first:
+ sent_starts.append(True)
+ first = False
+ else:
+ sent_starts.append(False)
+ token_idx += 1
offset += sum(len(token.words) for token in sentence.tokens)
- return tokens, heads
+ return tokens, heads, sent_starts

+ @staticmethod
+ def get_sentences(snlp_doc):
+ """Extract the sentences from the Stanza Doc.
+
+ snlp_doc (stanza.Document): The processed Stanza doc.
+ RETURNS (list): The sentences.
+ """
+ sentences = []
+ offset = 0
+ for sentence in snlp_doc.sentences:
+ words = []
+ for token in sentence.tokens:
+ words.extend([word.text for word in token.words])
+ sentences.append("".join(words))
+ offset += len(words)
+ return sentences

def get_words_and_spaces(self, words, text):
if "".join("".join(words).split()) != "".join(text.split()):
@@ -242,7 +273,7 @@ def get_words_and_spaces(self, words, text):
if text_pos < len(text):
text_words.append(text[text_pos:])
text_spaces.append(False)
- return (text_words, text_spaces)
+ return text_words, text_spaces

def token_vector(self, token):
"""Get Stanza's pretrained word embedding for given token.
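Taken together, these changes mean sentence boundaries are set explicitly from stanza's own segmentation rather than being derived from the dependency heads. A hedged end-to-end sketch (assumes the English stanza model has been downloaded with `stanza.download("en")`; the sample text is illustrative):

```python
import spacy_stanza

nlp = spacy_stanza.load_pipeline("en")
doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

# doc.sents should now mirror stanza's sentence segmentation.
for sent in doc.sents:
    print(sent.text)
```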