preparing for release

fmatter · Nov 5, 2023 · 4fe3be1 · 4fe3be1
1 parent 6edfbf8
commit 4fe3be1
Show file tree

Hide file tree

Showing 12 changed files with 106 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,19 +6,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+* `new` command
+* text record detail view
+
 ### Changed
 * reduced height of list items
 * configuration in a dict & optional values
+* organize by filename if no `Text_ID`
+* better feedback
 
 ## [0.1.1] - 2023-11-04
 
 ### Fixed
 * text loading, smoother interface overall
 
-## [0.1.0] - 2023-XX-XX
+## [0.1.0] - 2023-11-04
 
 Initial release
 
-[Unreleased]: https://github.com/fmatter/lingcorp/compare/v0.0.1...HEAD
-[0.1.1]: https://github.com/fmatter/lingcorp/compare/v0.0.1...v0.0.1
+[Unreleased]: https://github.com/fmatter/lingcorp/compare/v0.1.1...HEAD
+[0.1.1]: https://github.com/fmatter/lingcorp/compare/v0.1.0...v0.1.1
 [0.1.0]: https://github.com/fmatter/lingcorp/commit/insert_this_by_hand
diff --git a/docs/index.md b/docs/index.md
@@ -1,6 +1,12 @@
 # lingcorp
+Coming soon.
 
 ## Corpus annotation
+Your project directory should contain `input` and `output` directories, as well as a `conf.py` file.
+That configuration file describes the annotation setup.
+lingcorp requires two variables: `pipeline` and `config`.
 
+### The pipeline
+The pipeline (so called for historical reasons) is a list of configurable fields and dynamic annotator objects.
 
 ## Concordance search
diff --git a/pyproject.toml b/pyproject.toml
@@ -41,6 +41,7 @@ pygraid = "^0.1.0"
 parsimonious = "^0.10.0"
 pandas = "^2.0.1"
 writio = "^0.1.0"
+cookiecutter = "^2.4.0"
 
 [tool.poetry.group.dev.dependencies]
 keepachangelog = "^1.0.0"

diff --git a/src/lingcorp/annotator.py b/src/lingcorp/annotator.py
@@ -16,12 +16,13 @@
 
 class Annotator:
     def __init__(self, name="unnamed", **kwargs):
+        """Store the annotator's name and load its existing annotations from `<name>.yaml`."""
         self.name = name
         self.annotated_path = Path(f"{self.name}.yaml")
         self.annotated = load(self.annotated_path)
 
     def parse(self, record):
-        """Placeholder function. Returns a record with added data"""
+        """The parse method takes a text record, does something to it, then returns it."""
         return record
 
     def save(self):
@@ -67,14 +68,7 @@ def __init__(
     def parse(self, rec):
         rec[self.output_col] = ortho_strip(
             rec[self.src], replace=self.replace, strip=self.strip
-        )  # (
-        #     self.ortho_strip(
-        #         rec[self.src], exceptions=[","], additions=["%", "¿", "###", "#"]
-        #     )
-        #     .replace("  ", " ")
-        #     .replace("  ", " ")
-        #     .strip(" ")
-        # )
+        )
         return rec
 
 
@@ -207,7 +201,6 @@ def parse(self, record):
         return record
 
     def register_choice(self, record_id, pos, obj, choice):
-        # print("AH HA!", record_id, pos, obj, choice, self.annotated_path)
         self.annotated.setdefault(record_id, {})
         self.annotated[record_id].setdefault(int(pos), {})
         self.annotated[record_id][int(pos)][

diff --git a/src/lingcorp/cli/__init__.py b/src/lingcorp/cli/__init__.py
@@ -1,20 +1,24 @@
-from lingcorp.config import INPUT_DIR, OUTPUT_DIR
-from lingcorp.helpers import get_pos, load_data, run_pipeline
-
 import logging
 import sys
 from pathlib import Path
 
 import click
 import colorlog
+from cookiecutter.exceptions import OutputDirExistsException
+from cookiecutter.main import cookiecutter
+
+import lingcorp
+from lingcorp.config import INPUT_DIR, OUTPUT_DIR
+from lingcorp.helpers import get_pos, load_data, run_pipeline
 
 handler = colorlog.StreamHandler(None)
 handler.setFormatter(
     colorlog.ColoredFormatter("%(log_color)s%(levelname)-7s%(reset)s %(message)s")
 )
-log = logging.getLogger(__name__)
-log.setLevel(logging.DEBUG)
-log.propagate = False
+global log
+log = logging.getLogger()
+log.setLevel(logging.INFO)
+log.propagate = True
 log.addHandler(handler)
 
 
@@ -33,7 +37,12 @@ def main():
 def cli(limit, text):
     from conf import config, pipeline, pos_list
 
-    parse_csvs(pipeline, config.get("output_file", "parsed.csv"), config.get("filter", {}), pos_list)
+    parse_csvs(
+        pipeline,
+        config.get("output_file", "parsed.csv"),
+        config.get("filter", {}),
+        pos_list,
+    )
 
 
 @main.command()
@@ -42,15 +51,25 @@ def web():
 
     run_server()
 
+
+@main.command()
+@click.argument("name")
+def new(name):
+    try:
+        cookiecutter(
+            str(Path(lingcorp.__file__).parent / "project_template"),
+            output_dir=name,
+        )
+    except OutputDirExistsException as e:
+        print(e)
+        print("Run with --force option to overwrite!")
+        raise ValueError()
+
+
 def parse_csvs(pipeline, out_f, filter_params=None, pos_list=None):
     fields = {x["key"]: x for x in pipeline if isinstance(x, dict)}
     data = load_data(
-        rename={
-            "Primary_Text": "ort",
-            "Translated_Text": "oft",
-            "Speaker_ID": "spk",
-            "Text_ID": "txt",
-        },
+        fields=fields,
         filter_params=filter_params,
     )
     annotations = {}

diff --git a/src/lingcorp/helpers.py b/src/lingcorp/helpers.py
@@ -13,6 +13,7 @@
 from lingcorp.config import INPUT_DIR
 
 SEC_JOIN = ","
+SEP = "\t"
 
 uniparser_fields = {
     "wf": "srf",
@@ -31,11 +32,14 @@
 ud_pos = ["v"]
 
 
-def load_data(rename={}, filter_params={}):
-    log.info("Loading data")
+def load_data(fields={}, filter_params={}):
+    log.info("Loading data...")
     dfs = []
-    for file in INPUT_DIR.glob("*.csv"):
-        dfs.append(load(file, index_col="ID"))
+    filelist = list(INPUT_DIR.glob("*.csv"))
+    for file in tqdm(filelist, "Scanning input directory"):
+        df = load(file, index_col="ID")
+        df["filename"] = file.name
+        dfs.append(df)
     if not dfs:
         return None
     data = pd.concat(dfs)
@@ -44,7 +48,11 @@ def load_data(rename={}, filter_params={}):
             data = data[data[k] == v[0]]
         else:
             data = data[data[k] == v]
-    data.rename(columns=rename, inplace=True)
+    for key, field_data in fields.items():
+        if field_data.get("label", None) in data.columns:
+            data.rename(columns={field_data["label"]: key}, inplace=True)
+            if field_data.get("lvl", None) == "word":
+                data[key] = data[key].apply(lambda x: x.split(SEP))
     data["ID"] = data.index
     return data
 

diff --git a/src/lingcorp/project_template/cookiecutter.json b/src/lingcorp/project_template/cookiecutter.json
@@ -1,3 +1,5 @@
 {
-  "project_name": "atest"
+  "project_name": "",
+  "audio_directory": ""
+
 }
diff --git a/src/lingcorp/search.py b/src/lingcorp/search.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import pygraid
 from conf import config
+from tqdm import tqdm
 from writio import dump, load
 
 from lingcorp.cql import parse
@@ -232,7 +233,9 @@ def build_conc_line(
             postto = len(record[target_col])
         post = slice(end + 1, postto)
         if mode == "rich":
-            link = config.get("rec_link", "http://localhost:6543/sentences/{rec_id}").format(rec_id=record["rec"])
+            link = config.get(
+                "rec_link", "http://localhost:5001/example/{rec_id}"
+            ).format(rec_id=record["rec"])
             if link:
                 rec_text = f"""<a href="{link}">{record["rec"]}</a>"""
             else:
@@ -380,15 +383,17 @@ def query(
             if i >= len(alternatives):
                 return f"Invalid query: '{query_string}'"
         roundtrip = " ".join(str(x) for x in tokens)
-        print(f"Query: {roundtrip} == {query_string}")
+        log.info(f"Searching for {query_string} ({roundtrip})")
         rec_dics = {}
-        for i, rec in enumerate(self.to_dict("records")):
+        i = 0
+        for rec in tqdm(self.to_dict("records"), desc="Preparing word items"):
             rec_dics[i] = []
             for idx, dic in self.iter_words(rec, self.aligned_cols):
                 other_dic = {col: rec[col] for col in self.record_level if col in rec}
                 rec_dics[i].append({**dic, **{"idx": idx, "i": i}, **other_dic})
+            i += 1
         kwics = []
-        for rec_idx, word_dics in rec_dics.items():
+        for rec_idx, word_dics in tqdm(rec_dics.items(), desc="Building concordance"):
             start = None
             i = 0
             j = 0
@@ -445,10 +450,12 @@ def query(
                 #         "legend": f"Search results for {roundtrip}",
                 #     }
                 # )
+                log.info("Rendering HTML...")
                 res = kwics.to_html(index=False, escape=False)
                 if name:
                     self.conc_dir.mkdir(exist_ok=True, parents=True)
                     dump(res, f"{self.conc_dir}/{name}.html")
+                log.info("Finished query search")
                 return res
             elif conc_mode == "csv":
                 self.conc_dir.mkdir(exist_ok=True, parents=True)

diff --git a/src/lingcorp/server.py b/src/lingcorp/server.py
@@ -26,7 +26,7 @@
 AUDIO_PATH = Path(config.get("audio_path", ""))
 
 log = logging.getLogger(__name__)
-log.setLevel(logging.DEBUG)
+# log.setLevel(logging.DEBUG)
 
 
 app = Flask(__name__, static_url_path="/static")
@@ -77,15 +77,7 @@ def parse_graid(df, aligned_fields, target="all"):
 
 
 fields = {x["key"]: x for x in pipeline if isinstance(x, dict)}
-
-data = load_data(
-    rename={
-        "Primary_Text": "ort",
-        "Translated_Text": "oft",
-        "Speaker_ID": "spk",
-        "Text_ID": "txt",
-    }
-)
+data = load_data(fields=fields)
 
 texts = None
 if data is not None:
@@ -96,6 +88,7 @@ def parse_graid(df, aligned_fields, target="all"):
 
     annotations = {}
     data = run_pipeline(data, annotations, pipeline, pos_list)
+
     data.index = data["ID"]
     audios = []
     for x in AUDIO_PATH.iterdir():
@@ -119,8 +112,12 @@ def parse_graid(df, aligned_fields, target="all"):
     texts = {}
     if "graid" in data.columns:
         data = pd.DataFrame.from_dict(parse_graid(data, aligned_fields))
-    for text_id, textdata in data.groupby("txt"):
-        texts[text_id] = list(textdata.index)
+    for target in ["txt", "filename", "Language_ID"]:
+        if target in data.columns:
+            for text_id, textdata in data.groupby(target):
+                texts[text_id] = list(textdata.index)
+            break
+log.info("Annotation setup completed")
 
 
 def save():
@@ -154,10 +151,22 @@ def reparse(ex_id, target):
     return data.loc[ex_id]
 
 
+@app.route("/example/<exid>")
+def example_detail(exid):
+    ex = data.loc[exid]
+    field_data = {"precord": {}, "record": {}, "word": {}, "translations": {}}
+    for key, field in fields.items():
+        if key not in ex:
+            continue
+        field_data.setdefault(field["lvl"], {})
+        field_data[field["lvl"]][key] = field
+    return render_template(
+        "rich_record.html", ex=ex, fields=field_data, top_align="ann"
+    )
+
+
 @app.route("/example")
 def example():
-    if data is None:
-        return "None"
     exid = request.args.get("id")
     ex = data.loc[exid]
     field_data = {"precord": {}, "record": {}, "word": {}, "translations": {}}
@@ -185,7 +194,7 @@ def get_output():
     for f in Path(OUTPUT_DIR).iterdir():
         if f.suffix == ".csv":
             res.append(f.name)
-    return res
+    return sorted(res)
 
 
 @app.route("/texts")
@@ -349,10 +358,9 @@ def get_conc_fields():
 
 @app.route("/search")
 def search():
-    print(request.args.get("query"))
-    print(request.args.get("filename"))
     query = json.loads(request.args.get("query"))
     filename = json.loads(request.args.get("filename"))
+    log.info(f"Loading {filename}...")
     df = CorpusFrame(f"output/{filename}", list_cols=["mid", "grm"])
     return df.query(query, name=None, mode="rich")
 

diff --git a/src/lingcorp/static/js/annotation.js b/src/lingcorp/static/js/annotation.js
@@ -86,7 +86,7 @@ $(function () {
         data: { value: res, target: id },
         success: function (data) {
           $.ajax({
-            url: "/example",
+            url: "/example/",
             data: { id: exampleItem.id },
             success: function (result) {
               result = $.parseHTML(result);

diff --git a/src/lingcorp/templates/annotation.html b/src/lingcorp/templates/annotation.html
@@ -1,7 +1,7 @@
 {% extends 'base.html' %}
 
 {% block title %}
-concordance search
+corpus annotation
 {% endblock %}
 
 {% block css %}

diff --git a/src/lingcorp/templates/record.html b/src/lingcorp/templates/record.html
@@ -7,7 +7,7 @@
                     {% if field.get("edit") %}
                     <div class="{{key}}"><input type="text" value="{{ ex[key] }}" id="{{ex['ID']}}_{{key}}"></div>
                     {% else %}
-                    <span class="{{key}}">{{ ex[key] }}</span>
+                    <div><span class="{{key}}">{{ ex[key] }}</span></div>
                     {% endif %}
                 {% endfor %}
                 {% for item in ex["obj"] %}