Skip to content

Commit

Permalink
preparing for release
Browse files Browse the repository at this point in the history
  • Loading branch information
fmatter committed Nov 5, 2023
1 parent 6edfbf8 commit 4fe3be1
Show file tree
Hide file tree
Showing 12 changed files with 106 additions and 56 deletions.
12 changes: 9 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added
* `new` command
* text record detail view

### Changed
* reduced height of list items
* configuration in a dict & optional values
* organize by filename if no `Text_ID`
* better feedback

## [0.1.1] - 2023-11-04

### Fixed
* text loading, smoother interface overall

## [0.1.0] - 2023-XX-XX
## [0.1.0] - 2023-11-04

Initial release

[Unreleased]: https://github.com/fmatter/lingcorp/compare/v0.0.1...HEAD
[0.1.1]: https://github.com/fmatter/lingcorp/compare/v0.0.1...v0.0.1
[Unreleased]: https://github.com/fmatter/lingcorp/compare/v0.1.1...HEAD
[0.1.1]: https://github.com/fmatter/lingcorp/compare/v0.1.0...v0.1.1
[0.1.0]: https://github.com/fmatter/lingcorp/releases/tag/v0.1.0
6 changes: 6 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# lingcorp
Coming soon.

## Corpus annotation
Your project directory should contain `input` and `output` directories, as well as a `conf.py` file.
That configuration file describes the annotation setup.
lingcorp requires two variables to be defined in it: `pipeline` and `config`. The `cli` command additionally imports a `pos_list` variable from `conf.py`.

### The pipeline
The pipeline (so called for historical reasons) is a list of configurable fields and dynamic annotator objects.

## Concordance search
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ pygraid = "^0.1.0"
parsimonious = "^0.10.0"
pandas = "^2.0.1"
writio = "^0.1.0"
cookiecutter = "^2.4.0"

[tool.poetry.group.dev.dependencies]
keepachangelog = "^1.0.0"
Expand Down
13 changes: 3 additions & 10 deletions src/lingcorp/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@

class Annotator:
def __init__(self, name="unnamed", **kwargs):
"""The parse method takes a text record, does something to it, then returns it."""
self.name = name
self.annotated_path = Path(f"{self.name}.yaml")
self.annotated = load(self.annotated_path)

def parse(self, record):
"""Placeholder function. Returns a record with added data"""
"""The parse method takes a text record, does something to it, then returns it."""
return record

def save(self):
Expand Down Expand Up @@ -67,14 +68,7 @@ def __init__(
def parse(self, rec):
rec[self.output_col] = ortho_strip(
rec[self.src], replace=self.replace, strip=self.strip
) # (
# self.ortho_strip(
# rec[self.src], exceptions=[","], additions=["%", "¿", "###", "#"]
# )
# .replace(" ", " ")
# .replace(" ", " ")
# .strip(" ")
# )
)
return rec


Expand Down Expand Up @@ -207,7 +201,6 @@ def parse(self, record):
return record

def register_choice(self, record_id, pos, obj, choice):
# print("AH HA!", record_id, pos, obj, choice, self.annotated_path)
self.annotated.setdefault(record_id, {})
self.annotated[record_id].setdefault(int(pos), {})
self.annotated[record_id][int(pos)][
Expand Down
45 changes: 32 additions & 13 deletions src/lingcorp/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
from lingcorp.config import INPUT_DIR, OUTPUT_DIR
from lingcorp.helpers import get_pos, load_data, run_pipeline

import logging
import sys
from pathlib import Path

import click
import colorlog
from cookiecutter.exceptions import OutputDirExistsException
from cookiecutter.main import cookiecutter

import lingcorp
from lingcorp.config import INPUT_DIR, OUTPUT_DIR
from lingcorp.helpers import get_pos, load_data, run_pipeline

handler = colorlog.StreamHandler(None)
handler.setFormatter(
colorlog.ColoredFormatter("%(log_color)s%(levelname)-7s%(reset)s %(message)s")
)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.propagate = False
global log
log = logging.getLogger()
log.setLevel(logging.INFO)
log.propagate = True
log.addHandler(handler)


Expand All @@ -33,7 +37,12 @@ def main():
def cli(limit, text):
from conf import config, pipeline, pos_list

parse_csvs(pipeline, config.get("output_file", "parsed.csv"), config.get("filter", {}), pos_list)
parse_csvs(
pipeline,
config.get("output_file", "parsed.csv"),
config.get("filter", {}),
pos_list,
)


@main.command()
Expand All @@ -42,15 +51,25 @@ def web():

run_server()


@main.command()
@click.argument("name")
@click.option(
    "--force",
    is_flag=True,
    default=False,
    help="Overwrite existing output if the project directory already exists.",
)
def new(name, force=False):
    """Generate a new project from the bundled template, writing it under NAME.

    Pass --force to let cookiecutter overwrite an already existing output
    directory; without it, an existing directory aborts the command.

    Raises:
        ValueError: if the output directory exists and --force was not given.
    """
    # The cookiecutter template is located next to the installed package.
    template_dir = Path(lingcorp.__file__).parent / "project_template"
    try:
        cookiecutter(
            str(template_dir),
            output_dir=name,
            overwrite_if_exists=force,
        )
    except OutputDirExistsException as e:
        print(e)
        print("Run with --force option to overwrite!")
        # Keep the original exception type, but carry the cause and a message.
        raise ValueError(str(e)) from e


def parse_csvs(pipeline, out_f, filter_params=None, pos_list=None):
fields = {x["key"]: x for x in pipeline if isinstance(x, dict)}
data = load_data(
rename={
"Primary_Text": "ort",
"Translated_Text": "oft",
"Speaker_ID": "spk",
"Text_ID": "txt",
},
fields=fields,
filter_params=filter_params,
)
annotations = {}
Expand Down
18 changes: 13 additions & 5 deletions src/lingcorp/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from lingcorp.config import INPUT_DIR

SEC_JOIN = ","
SEP = "\t"

uniparser_fields = {
"wf": "srf",
Expand All @@ -31,11 +32,14 @@
ud_pos = ["v"]


def load_data(rename={}, filter_params={}):
log.info("Loading data")
def load_data(fields={}, filter_params={}):
log.info("Loading data...")
dfs = []
for file in INPUT_DIR.glob("*.csv"):
dfs.append(load(file, index_col="ID"))
filelist = list(INPUT_DIR.glob("*.csv"))
for file in tqdm(filelist, "Scanning input directory"):
df = load(file, index_col="ID")
df["filename"] = file.name
dfs.append(df)
if not dfs:
return None
data = pd.concat(dfs)
Expand All @@ -44,7 +48,11 @@ def load_data(rename={}, filter_params={}):
data = data[data[k] == v[0]]
else:
data = data[data[k] == v]
data.rename(columns=rename, inplace=True)
for key, field_data in fields.items():
if field_data.get("label", None) in data.columns:
data.rename(columns={field_data["label"]: key}, inplace=True)
if field_data.get("lvl", None) == "word":
data[key] = data[key].apply(lambda x: x.split(SEP))
data["ID"] = data.index
return data

Expand Down
4 changes: 3 additions & 1 deletion src/lingcorp/project_template/cookiecutter.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
{
"project_name": "atest"
"project_name": "",
"audio_directory": ""

}
15 changes: 11 additions & 4 deletions src/lingcorp/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import pygraid
from conf import config
from tqdm import tqdm
from writio import dump, load

from lingcorp.cql import parse
Expand Down Expand Up @@ -232,7 +233,9 @@ def build_conc_line(
postto = len(record[target_col])
post = slice(end + 1, postto)
if mode == "rich":
link = config.get("rec_link", "http://localhost:6543/sentences/{rec_id}").format(rec_id=record["rec"])
link = config.get(
"rec_link", "http://localhost:5001/example/{rec_id}"
).format(rec_id=record["rec"])
if link:
rec_text = f"""<a href="{link}">{record["rec"]}</a>"""
else:
Expand Down Expand Up @@ -380,15 +383,17 @@ def query(
if i >= len(alternatives):
return f"Invalid query: '{query_string}'"
roundtrip = " ".join(str(x) for x in tokens)
print(f"Query: {roundtrip} == {query_string}")
log.info(f"Searching for {query_string} ({roundtrip})")
rec_dics = {}
for i, rec in enumerate(self.to_dict("records")):
i = 0
for rec in tqdm(self.to_dict("records"), desc="Preparing word items"):
rec_dics[i] = []
for idx, dic in self.iter_words(rec, self.aligned_cols):
other_dic = {col: rec[col] for col in self.record_level if col in rec}
rec_dics[i].append({**dic, **{"idx": idx, "i": i}, **other_dic})
i += 1
kwics = []
for rec_idx, word_dics in rec_dics.items():
for rec_idx, word_dics in tqdm(rec_dics.items(), desc="Building concordance"):
start = None
i = 0
j = 0
Expand Down Expand Up @@ -445,10 +450,12 @@ def query(
# "legend": f"Search results for {roundtrip}",
# }
# )
log.info("Rendering HTML...")
res = kwics.to_html(index=False, escape=False)
if name:
self.conc_dir.mkdir(exist_ok=True, parents=True)
dump(res, f"{self.conc_dir}/{name}.html")
log.info("Finished query search")
return res
elif conc_mode == "csv":
self.conc_dir.mkdir(exist_ok=True, parents=True)
Expand Down
42 changes: 25 additions & 17 deletions src/lingcorp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
AUDIO_PATH = Path(config.get("audio_path", ""))

log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
# log.setLevel(logging.DEBUG)


app = Flask(__name__, static_url_path="/static")
Expand Down Expand Up @@ -77,15 +77,7 @@ def parse_graid(df, aligned_fields, target="all"):


fields = {x["key"]: x for x in pipeline if isinstance(x, dict)}

data = load_data(
rename={
"Primary_Text": "ort",
"Translated_Text": "oft",
"Speaker_ID": "spk",
"Text_ID": "txt",
}
)
data = load_data(fields=fields)

texts = None
if data is not None:
Expand All @@ -96,6 +88,7 @@ def parse_graid(df, aligned_fields, target="all"):

annotations = {}
data = run_pipeline(data, annotations, pipeline, pos_list)

data.index = data["ID"]
audios = []
for x in AUDIO_PATH.iterdir():
Expand All @@ -119,8 +112,12 @@ def parse_graid(df, aligned_fields, target="all"):
texts = {}
if "graid" in data.columns:
data = pd.DataFrame.from_dict(parse_graid(data, aligned_fields))
for text_id, textdata in data.groupby("txt"):
texts[text_id] = list(textdata.index)
for target in ["txt", "filename", "Language_ID"]:
if target in data.columns:
for text_id, textdata in data.groupby(target):
texts[text_id] = list(textdata.index)
break
log.info("Annotation setup completed")


def save():
Expand Down Expand Up @@ -154,10 +151,22 @@ def reparse(ex_id, target):
return data.loc[ex_id]


@app.route("/example/<exid>")
def example_detail(exid):
    """Render the "rich_record.html" detail view for a single record.

    ``exid`` is taken from the URL and looked up in the module-level ``data``
    frame. Responds with 404 instead of an unhandled exception when no data
    has been loaded or the ID is not in the index.
    """
    if data is None:
        return "No data loaded", 404
    try:
        ex = data.loc[exid]
    except KeyError:
        # Do not echo the user-supplied ID back into the HTML response.
        return "Record not found", 404
    # Group the configured fields that are present in this record by level.
    field_data = {"precord": {}, "record": {}, "word": {}, "translations": {}}
    for key, field in fields.items():
        if key in ex:
            field_data.setdefault(field["lvl"], {})[key] = field
    return render_template(
        "rich_record.html", ex=ex, fields=field_data, top_align="ann"
    )


@app.route("/example")
def example():
if data is None:
return "None"
exid = request.args.get("id")
ex = data.loc[exid]
field_data = {"precord": {}, "record": {}, "word": {}, "translations": {}}
Expand Down Expand Up @@ -185,7 +194,7 @@ def get_output():
for f in Path(OUTPUT_DIR).iterdir():
if f.suffix == ".csv":
res.append(f.name)
return res
return sorted(res)


@app.route("/texts")
Expand Down Expand Up @@ -349,10 +358,9 @@ def get_conc_fields():

@app.route("/search")
def search():
print(request.args.get("query"))
print(request.args.get("filename"))
query = json.loads(request.args.get("query"))
filename = json.loads(request.args.get("filename"))
log.info(f"Loading {filename}...")
df = CorpusFrame(f"output/{filename}", list_cols=["mid", "grm"])
return df.query(query, name=None, mode="rich")

Expand Down
2 changes: 1 addition & 1 deletion src/lingcorp/static/js/annotation.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ $(function () {
data: { value: res, target: id },
success: function (data) {
$.ajax({
url: "/example",
url: "/example/",
data: { id: exampleItem.id },
success: function (result) {
result = $.parseHTML(result);
Expand Down
2 changes: 1 addition & 1 deletion src/lingcorp/templates/annotation.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{% extends 'base.html' %}

{% block title %}
concordance search
corpus annotation
{% endblock %}

{% block css %}
Expand Down
2 changes: 1 addition & 1 deletion src/lingcorp/templates/record.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
{% if field.get("edit") %}
<div class="{{key}}"><input type="text" value="{{ ex[key] }}" id="{{ex['ID']}}_{{key}}"></div>
{% else %}
<span class="{{key}}">{{ ex[key] }}</span>
<div><span class="{{key}}">{{ ex[key] }}</span></div>
{% endif %}
{% endfor %}
{% for item in ex["obj"] %}
Expand Down

0 comments on commit 4fe3be1

Please sign in to comment.