From 06c19b45c7542aa214f37116b7024b742f0210f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20D=C3=B6rfelt?= <martin.d@andix.de>
Date: Sat, 12 Oct 2024 14:26:18 +0200
Subject: [PATCH] add support for eml

---
 docs/formats/default.md |   1 +
 src/converter.py        |   5 ++
 src/markdown_lib/eml.py | 108 ++++++++++++++++++++++++++++++++++++++++
 test/data               |   2 +-
 test/test_convert.py    |   1 +
 5 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 src/markdown_lib/eml.py

diff --git a/docs/formats/default.md b/docs/formats/default.md
index 83214c3d..2e53eaed 100644
--- a/docs/formats/default.md
+++ b/docs/formats/default.md
@@ -18,6 +18,7 @@ The default import covers the following formats:
     - [Rich Text Format (RTF))](https://en.wikipedia.org/wiki/Rich_Text_Format)
     - [txt2tags](https://txt2tags.org/)
 - [asciidoc](https://docs.asciidoctor.org/asciidoc/latest/) (requires [asciidoctor](https://asciidoctor.org/) installed and in path)
+- [Email (eml)](https://en.wikipedia.org/wiki/Email#Filename_extensions)
 - [Fountain](https://fountain.io/):
     - There is a [built-in Joplin plugin](https://joplinapp.org/help/apps/markdown/#markdown-plugins) that can be activated in the settings.
     - There is a [Obsidian plugin](https://github.com/Darakah/obsidian-fountain).
diff --git a/src/converter.py b/src/converter.py
index 81145746..4917de49 100644
--- a/src/converter.py
+++ b/src/converter.py
@@ -8,6 +8,7 @@
 import common
 import intermediate_format as imf
 import markdown_lib.common
+import markdown_lib.eml
 
 
 class BaseConverter(abc.ABC):
@@ -136,6 +137,10 @@ def convert_file(self, file_: Path, parent: imf.Notebook):
                 if note_body_splitted[-2].startswith("Last updated "):
                     # Remove unnecessarily added lines if needed.
                     note_body = "\n".join(note_body_splitted[:-2])
+            case ".eml":
+                note_imf = markdown_lib.eml.eml_to_note(file_, self.resource_folder)
+                parent.child_notes.append(note_imf)
+                return  # don't use the common conversion
             case ".fountain":
                 # Simply wrap in a code block. This is supported in
                 # Joplin and Obsidian via plugins.
diff --git a/src/markdown_lib/eml.py b/src/markdown_lib/eml.py
new file mode 100644
index 00000000..c1a80491
--- /dev/null
+++ b/src/markdown_lib/eml.py
@@ -0,0 +1,108 @@
+"""Convert an Email (.eml) to the intermediate format."""
+
+import datetime as dt
+import email
+import email.policy
+import logging
+from pathlib import Path
+import time
+
+import common
+import intermediate_format as imf
+import markdown_lib.common
+
+
+LOGGER = logging.getLogger("jimmy")
+
+
+def decode_payload(part) -> str:
+    """Return the text of `part`; fall back to lenient UTF-8 on a broken charset."""
+    try:
+        return part.get_content()
+    except (LookupError, UnicodeDecodeError):
+        return part.get_payload(decode=True).decode("utf-8", errors="replace")
+
+
+def handle_part(part, attachment_folder: Path) -> tuple[list[str], list[imf.Resource]]:
+    """Convert one non-multipart part to text fragments or to a saved resource."""
+    mime_type = part.get_content_type()
+    if mime_type == "text/html":
+        return [markdown_lib.common.markup_to_markdown(decode_payload(part))], []
+    if mime_type in ("text/markdown", "text/plain"):
+        return [decode_payload(part)], []
+    if not mime_type.startswith(("audio/", "image/", "application/", "text/")):
+        LOGGER.debug(f"Unhandled mime type: {mime_type}")
+        return [], []
+
+    # Remaining parts are attachments: store the payload and reference it.
+    content_id = part.get("Content-ID")
+    # id seems to be enclosed by <> here, but by [] in the body
+    cid_ref = None if content_id is None else f"[cid:{content_id[1:-1]}]"
+    target = attachment_folder / common.unique_title()
+    target.write_bytes(part.get_payload(decode=True))
+    attachment = imf.Resource(target, original_text=cid_ref, title=part.get_filename())
+    return [], [attachment]
+
+
+def parse_message(
+    message, attachment_folder: Path
+) -> tuple[list[str], list[imf.Resource]]:
+    """Walk `message` recursively and gather its text and its resources.
+
+    Returns the text fragments and the resources in the order of the parts.
+    """
+    if not message.is_multipart():
+        # leaf: convert the part itself
+        return handle_part(message, attachment_folder)
+
+    if message.get_content_type() == "multipart/alternative":
+        # The alternatives carry the same content: take only the preferred one.
+        # Plain text is favored over HTML, since it is easy to process.
+        preferred = message.get_body(preferencelist=("plain", "html"))
+        if preferred is None:
+            LOGGER.debug("failed to obtain body")
+            return [], []
+        return handle_part(preferred, attachment_folder)
+
+    # any other multipart container: descend into every available payload
+    texts: list[str] = []
+    attachments: list[imf.Resource] = []
+    for child in message.get_payload():
+        child_texts, child_attachments = parse_message(child, attachment_folder)
+        texts += child_texts
+        attachments += child_attachments
+    return texts, attachments
+
+
+def eml_to_note(file_: Path, attachment_folder: Path) -> imf.Note:
+    """Convert the email at `file_` to a note; attachments go to `attachment_folder`.
+
+    The heading is the subject; the date is taken from "Date" or else "Received".
+    """
+    # decode the header by using the default policy
+    # https://stackoverflow.com/a/55210089/7410886
+    message = email.message_from_bytes(file_.read_bytes(), policy=email.policy.default)
+
+    date = None
+    received = message["Received"]
+    received_date = None if received is None else received.split("; ")[-1]
+    for raw in (message["Date"], received_date):
+        if raw is not None and (parsed := email.utils.parsedate_tz(raw)) is not None:
+            # mktime_tz honors the UTC offset of the header (mktime would ignore it)
+            date = dt.datetime.fromtimestamp(email.utils.mktime_tz(parsed))
+            break
+    else:
+        LOGGER.debug("failed to obtain date")
+
+    body, resources = parse_message(message, attachment_folder)
+    subject = message["Subject"]  # own variable: nested quotes need Python >= 3.12
+    return imf.Note(
+        # TODO: f"{0 if date is None else date.isoformat()}_{subject}",
+        file_.stem,
+        "\n".join([f"# {subject}", ""] + body),
+        source_application="jimmy",
+        resources=resources,
+        created=date,
+        updated=date,
+        author=message["From"],
+    )
diff --git a/test/data b/test/data
index c88dfcfb..2e7344f1 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit c88dfcfbbbc84b07664edab0e9ac0e6ab4732706
+Subproject commit 2e7344f1c0b09938e7911f28e194b0d42048c8eb
diff --git a/test/test_convert.py b/test/test_convert.py
index 1c9e1ee6..2a81a070 100644
--- a/test/test_convert.py
+++ b/test/test_convert.py
@@ -169,6 +169,7 @@ def test_formats(self, test_input):
             ["single_file", ["default_format/arbitrary_folder/plaintext.txt"]],
             ["multiple_files", ["default_format/arbitrary_folder/plaintext.txt"] * 2],
             ["markdown_file", ["default_format/arbitrary_folder/sample.md"]],
+            ["eml", ["default_format/eml"]],
         ]
     )
     def test_default_format(self, test_name, test_input):