add support for anki

marph91 · Oct 3, 2024 · 8cb263b · 8cb263b
1 parent 853a010
commit 8cb263b
Show file tree

Hide file tree

Showing 8 changed files with 203 additions and 3 deletions.
diff --git a/docs/formats/anki.md b/docs/formats/anki.md
@@ -0,0 +1,29 @@
+This page describes how to convert Anki cards to Markdown.
+
+## General Information
+
+- [Website](https://apps.ankiweb.net/)
+- Typical extension: `.apkg`
+
+## Instructions
+
+1. Export as described [at the wiki](https://docs.ankiweb.net/exporting.html)
+    1. Choose "Deck (.apkg)"
+2. [Install jimmy](../index.md#installation)
+3. Convert to Markdown. Example: `jimmy-cli-linux MEILLEUR_DECK_ANGLAIS_3000.apkg --format anki`
+4. [Import to your app](../import_instructions.md)
+
+## Import Structure
+
+- Decks are converted to folders.
+- Cards are converted to Markdown files. The content is the "answer" data.
+- Referenced resources (audio, images and other files) are converted.
+
+A converted page looks like:
+
+![](../images/anki_markdown_example.png)
+
+## Known Limitations
+
+- Nested decks are not tested and most likely not working.
+- HTML formatting is lost. It's too complex to cover all HTML templates properly.
diff --git a/docs/images/anki_markdown_example.png b/docs/images/anki_markdown_example.png
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -52,6 +52,7 @@ nav:
   - index.md
   - Formats:
     - Default Import: formats/default.md
+    - Anki: formats/anki.md
     - Bear: formats/bear.md
     - Cacher: formats/cacher.md
     - CherryTree: formats/cherrytree.md
@@ -86,7 +87,7 @@ nav:
   - Additional features:
     - Filters: additional_features/filters.md
     - Show the Note Tree: additional_features/show_note_tree.md
-    - Miscellaneous: miscellaneous.md
+    - Miscellaneous: additional_features/miscellaneous.md
   - Contributing:
     - How to Contribute?: contributing/contributing.md
     - More Note Apps: contributing/more_note_apps.md
diff --git a/src/converter.py b/src/converter.py
@@ -32,7 +32,7 @@ def prepare_input(self, input_: Path) -> Path:
                 return common.get_single_child_folder(temp_folder)
             case ".jex" | ".tgz" | ".tar.gz":
                 return common.extract_tar(input_)
-            case ".nsx" | ".zip" | ".zkn3":
+            case ".apkg" | ".nsx" | ".zip" | ".zkn3":
                 return common.extract_zip(input_)
             case _:  # ".textbundle", folder
                 return input_

diff --git a/src/formats/anki.py b/src/formats/anki.py
@@ -0,0 +1,163 @@
+"""Convert Anki cards to the intermediate format."""
+
+from pathlib import Path
+import json
+import re
+import sqlite3
+
+import converter
+import intermediate_format as imf
+
+
+# Matches an '<img src="...">' tag. Group 1 is the complete matched tag,
+# group 2 is the text between 'src="' and the closing '">', '" >' or '" />'.
+IMAGE_RE = re.compile(r"(<img src=\"(.*?)\"(?:>| >| \/>))")
+# Matches a bracketed '[<prefix>:<value>]' reference like '[sound:file.mp3]'.
+# Group 1 is the complete reference, group 2 is the text after the first colon.
+# NOTE(review): the prefix is not restricted to "sound", so any bracketed text
+# containing a colon matches as well.
+SOUND_RE = re.compile(r"(\[.*?:(.*?)\])")
+
+
+def get_images(body: str) -> list[tuple[str, str]]:
+    """
+    >>> get_images('<img src="awake-55ab4bc5f5.jpg">')
+    [('<img src="awake-55ab4bc5f5.jpg">', 'awake-55ab4bc5f5.jpg')]
+    >>> get_images('<img src="prepositions_14.jpg" />')
+    [('<img src="prepositions_14.jpg" />', 'prepositions_14.jpg')]
+    """
+    return IMAGE_RE.findall(body)
+
+
+def get_sounds(body: str) -> list[tuple[str, str]]:
+    """
+    >>> get_sounds("[sound:rec1430907056.mp3]")
+    [('[sound:rec1430907056.mp3]', 'rec1430907056.mp3')]
+    """
+    return SOUND_RE.findall(body)
+
+
+class Converter(converter.BaseConverter):
+    accepted_extensions = [".apkg"]
+
+    def convert(self, file_or_folder: Path):
+        # TODO
+        # pylint: disable=too-many-locals
+
+        if (self.root_path / "collection.anki21").is_file():
+            db_file = self.root_path / "collection.anki21"
+        elif (self.root_path / "collection.anki2").is_file():
+            db_file = self.root_path / "collection.anki2"
+        else:
+            self.logger.error("Couldn't find note database.")
+            return
+
+        media_dict = json.loads((self.root_path / "media").read_text(encoding="utf-8"))
+        # switch keys and values, because we need the names in the notes later
+        media_dict = {v: k for k, v in media_dict.items()}
+
+        conn = sqlite3.connect(db_file)
+        cur = conn.cursor()
+
+        # collection
+        # https://github.com/ankidroid/Anki-Android/wiki/Database-Structure#collection
+        collection = list(cur.execute("select * from col"))[0]
+        if collection[4] != 11:
+            self.logger.warning(
+                f"Only tested with version 11. Got version {collection[4]}"
+            )
+
+        # models
+        # https://github.com/ankidroid/Anki-Android/wiki/Database-Structure#models-jsonobjects
+        models = json.loads(collection[9])
+
+        # decks
+        # https://github.com/ankidroid/Anki-Android/wiki/Database-Structure#decks-jsonobjects
+        decks = json.loads(collection[10])
+        # TODO: nested decks
+        for deck_id, deck in decks.items():
+            self.root_notebook.child_notebooks.append(
+                imf.Notebook(deck["name"], original_id=str(deck_id))
+            )
+
+        # cards
+        # https://github.com/ankidroid/Anki-Android/wiki/Database-Structure#cards
+        note_deck_id_map = {}
+        for note_id, deck_id in cur.execute("select nid, did from cards"):
+            note_deck_id_map[str(note_id)] = str(deck_id)
+
+        # notes
+        # https://github.com/ankidroid/Anki-Android/wiki/Database-Structure#notes
+        for note_index, (
+            created,
+            original_id,
+            model_id,
+            updated,
+            tags,
+            data,
+        ) in enumerate(cur.execute("select id, guid, mid, mod, tags, flds from notes")):
+            model = models[str(model_id)]
+            template_replacements = dict(
+                zip([f["name"] for f in model["flds"]], data.split("\x1f"))
+            )
+
+            # TODO: Templates are too complex for pandoc conversion.
+            # Just take the replacements for now.
+            # def replace(templ, replacements):
+            #     for key, value in replacements.items():
+            #         templ = templ.replace(f"{{{{{key}}}}}", value + " ")
+            #     return templ
+
+            # for template in model["tmpls"]:
+            #     front = replace(template["qfmt"], template_replacements)
+            #     template_replacements["FrontSide"] = front
+
+            #     # treat the backside as complete note
+            #     back = (
+            #         model["css"]
+            #         + "\n\n"
+            #         + replace(template["afmt"], template_replacements)
+            #     )
+            #     body = markdown_lib.common.markup_to_markdown(back)
+            body_md = "\n".join(
+                [f"- {key}: {value}" for key, value in template_replacements.items()]
+            )
+            # cleanup
+            body_md = (
+                body_md.replace("<br>\n", "\n")
+                .replace("&nbsp;", " ")
+                .replace("<div>", "")
+                .replace("</div>", "")
+            )
+
+            # find images, sounds and other attachments
+            resources = []
+            for text, filename_note in get_images(body_md) + get_sounds(body_md):
+                resources.append(
+                    imf.Resource(
+                        self.root_path / media_dict[filename_note],
+                        text,
+                        filename_note,
+                    )
+                )
+
+            note_imf = imf.Note(
+                # TODO: Anki doesn't have note names. Find a robust note name.
+                # The index is a bit better readeable than the original_id.
+                f"note_{note_index:010}",
+                body_md,
+                original_id=str(original_id),
+                created=created,
+                updated=updated,
+                resources=resources,
+                tags=[imf.Tag(t) for t in tags.strip().split(" ") if t],
+            )
+
+            found_parent_notebook = False
+            parent_deck_id = note_deck_id_map.get(str(created))
+            for notebook in self.root_notebook.child_notebooks:
+                if notebook.original_id == parent_deck_id:
+                    notebook.child_notes.append(note_imf)
+                    found_parent_notebook = True
+                    break
+            if not found_parent_notebook:
+                self.root_notebook.child_notes.append(note_imf)
+
+        # Don't export empty notebooks
+        self.root_notebook.child_notebooks = [
+            nb for nb in self.root_notebook.child_notebooks if not nb.is_empty()
+        ]
diff --git a/src/importer.py b/src/importer.py
@@ -76,6 +76,10 @@ def safe_path(path: Path | str) -> Path | str:
     if safe_name in forbidden_names:
         safe_name += "_"
 
+    # Limit filename to 200 characters
+    # https://serverfault.com/a/9548
+    safe_name = safe_name[:200]
+
     return safe_name if isinstance(path, str) else path.with_name(safe_name)
 
 

diff --git a/test/data b/test/data
diff --git a/test/test_convert.py b/test/test_convert.py
@@ -77,6 +77,9 @@ def compare_dirs(dir1: Path, dir2: Path):
 
     @parameterized.expand(
         [
+            [["anki/test_1/MEILLEUR_DECK_ANGLAIS_3000.apkg"]],
+            [["anki/test_2/Ukrainian_Prepositions_pictsaudio_ENG-UA__UA-ENG.apkg"]],
+            [["anki/test_3/Hebrew_Alphabet_with_vowels.apkg"]],
             [["bear/test_1/backup.bear2bk"]],
             [["bear/test_2/backup-2.bear2bk"]],
             [["cacher/test_1/cacher-export-202406182304.json"]],