From 06c19b45c7542aa214f37116b7024b742f0210f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20D=C3=B6rfelt?=
Date: Sat, 12 Oct 2024 14:26:18 +0200
Subject: [PATCH] add support for eml

---
Review notes (this section is ignored by "git am"):
- eml.py: the UTC offset of the "Date"/"Received" header is respected now.
  "email.utils.parsedate()" + "time.mktime()" dropped the offset and treated
  the sender's wall clock time as local time.
- eml.py: import "email.utils" explicitly, drop the unused "time" import.
- eml.py: the utf-8 fallback of "decode_payload()" can't raise again.
- eml.py: a Content-ID without angle brackets isn't truncated anymore.
- eml.py: a missing subject doesn't yield a "# None" heading anymore. The
  nested same-quote f-string (Python >= 3.12 only) is gone, too.
- NOTE(review): the line structure and the indentation of the context lines
  were reconstructed from a whitespace-collapsed copy of this patch. Verify
  them against the tree before applying. The blob id of the new file was
  dropped, because it doesn't match the amended content.

 docs/formats/default.md |   1 +
 src/converter.py        |   5 ++
 src/markdown_lib/eml.py | 127 ++++++++++++++++++++++++++++++++++++++++
 test/data               |   2 +-
 test/test_convert.py    |   1 +
 5 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 src/markdown_lib/eml.py

diff --git a/docs/formats/default.md b/docs/formats/default.md
index 83214c3d..2e53eaed 100644
--- a/docs/formats/default.md
+++ b/docs/formats/default.md
@@ -18,6 +18,7 @@ The default import covers the following formats:
 - [Rich Text Format (RTF))](https://en.wikipedia.org/wiki/Rich_Text_Format)
 - [txt2tags](https://txt2tags.org/)
 - [asciidoc](https://docs.asciidoctor.org/asciidoc/latest/) (requires [asciidoctor](https://asciidoctor.org/) installed and in path)
+- [Email (eml)](https://en.wikipedia.org/wiki/Email#Filename_extensions)
 - [Fountain](https://fountain.io/):
     - There is a [built-in Joplin plugin](https://joplinapp.org/help/apps/markdown/#markdown-plugins) that can be activated in the settings.
     - There is a [Obsidian plugin](https://github.com/Darakah/obsidian-fountain).
diff --git a/src/converter.py b/src/converter.py
index 81145746..4917de49 100644
--- a/src/converter.py
+++ b/src/converter.py
@@ -8,6 +8,7 @@
 import common
 import intermediate_format as imf
 import markdown_lib.common
+import markdown_lib.eml
 
 
 class BaseConverter(abc.ABC):
@@ -136,6 +137,10 @@ def convert_file(self, file_: Path, parent: imf.Notebook):
                 if note_body_splitted[-2].startswith("Last updated "):
                     # Remove unnecessarily added lines if needed.
                     note_body = "\n".join(note_body_splitted[:-2])
+            case ".eml":
+                note_imf = markdown_lib.eml.eml_to_note(file_, self.resource_folder)
+                parent.child_notes.append(note_imf)
+                return  # don't use the common conversion
             case ".fountain":
                 # Simply wrap in a code block. This is supported in
                 # Joplin and Obsidian via plugins.
diff --git a/src/markdown_lib/eml.py b/src/markdown_lib/eml.py
new file mode 100644
--- /dev/null
+++ b/src/markdown_lib/eml.py
@@ -0,0 +1,127 @@
+"""Convert an Email (.eml) to the intermediate format."""
+
+import datetime as dt
+import email
+import email.policy
+import email.utils
+import logging
+from pathlib import Path
+
+import common
+import intermediate_format as imf
+import markdown_lib.common
+
+
+LOGGER = logging.getLogger("jimmy")
+
+
+def decode_payload(part) -> str:
+    """Return the text content of a non-multipart message part."""
+    try:
+        return part.get_content()
+    except (LookupError, UnicodeDecodeError):
+        # Unknown or wrong charset: fall back to "utf-8" and replace
+        # undecodable bytes instead of failing a second time.
+        raw = part.get_payload(decode=True)
+        return "" if raw is None else raw.decode("utf-8", errors="replace")
+
+
+def handle_part(part, attachment_folder: Path) -> tuple[list[str], list[imf.Resource]]:
+    """Convert a non-multipart part to body text or to an attached resource."""
+    mime = part.get_content_type()
+    if mime == "text/html":
+        return [markdown_lib.common.markup_to_markdown(decode_payload(part))], []
+    if mime in ("text/markdown", "text/plain"):
+        return [decode_payload(part)], []
+    if mime.startswith(("audio/", "image/", "application/", "text/")):
+        id_ = part.get("Content-ID")
+        if id_ is not None:
+            # id seems to be enclosed by <> here, but by [] in the body
+            id_ = f"[cid:{id_.strip().strip('<>')}]"
+        unique_resource_path = attachment_folder / common.unique_title()
+        unique_resource_path.write_bytes(part.get_payload(decode=True))
+        resource = imf.Resource(
+            unique_resource_path, original_text=id_, title=part.get_filename()
+        )
+        return [], [resource]
+    LOGGER.debug("Unhandled mime type: %s", mime)
+    return [], []
+
+
+def parse_message(
+    message, attachment_folder: Path
+) -> tuple[list[str], list[imf.Resource]]:
+    """Collect body text and resources of a message, descending into multiparts."""
+    if not message.is_multipart():
+        return handle_part(message, attachment_folder)
+
+    if message.get_content_type() == "multipart/alternative":
+        # choose the best payload: text is easy to process
+        best_payload = message.get_body(preferencelist=("plain", "html"))
+        if best_payload is None:
+            LOGGER.debug("failed to obtain body")
+            return [], []
+        return handle_part(best_payload, attachment_folder)
+
+    # iterate over all available payloads
+    body = []
+    resources = []
+    for payload in message.get_payload():
+        part_body, part_resources = parse_message(payload, attachment_folder)
+        body.extend(part_body)
+        resources.extend(part_resources)
+    return body, resources
+
+
+def parse_date(message) -> dt.datetime | None:
+    """Return the message date as naive local time or None if there is none.
+
+    "Date" is preferred. The timestamp of "Received" is the fallback.
+    """
+    candidates = []
+    if message["Date"] is not None:
+        candidates.append(str(message["Date"]))
+    if message["Received"] is not None:
+        # the timestamp follows the last semicolon
+        candidates.append(str(message["Received"]).rsplit(";", maxsplit=1)[-1])
+    for candidate in candidates:
+        parsed_date = email.utils.parsedate_tz(candidate)
+        if parsed_date is None:
+            continue
+        try:
+            # Respect the UTC offset of the header. "time.mktime()" would
+            # interpret the sender's wall clock time as local time.
+            timestamp = email.utils.mktime_tz(parsed_date)
+            return dt.datetime.fromtimestamp(timestamp)
+        except (OverflowError, OSError, ValueError):
+            LOGGER.debug("date out of range: %s", candidate)
+    return None
+
+
+def eml_to_note(file_: Path, attachment_folder: Path) -> imf.Note:
+    """Convert an eml file to a note and store its attachments in the folder."""
+    # decode the header by using the default policy
+    # https://stackoverflow.com/a/55210089/7410886
+    message = email.message_from_bytes(file_.read_bytes(), policy=email.policy.default)
+
+    date = parse_date(message)
+    if date is None:
+        LOGGER.debug("failed to obtain date")
+
+    body, resources = parse_message(message, attachment_folder)
+
+    # Avoid a "# None" heading if the mail has no subject.
+    subject = message["Subject"]
+    heading = file_.stem if subject is None else str(subject)
+
+    note_imf = imf.Note(
+        # TODO: f"{0 if date is None else date.isoformat()}_{message["Subject"]}",
+        file_.stem,
+        "\n".join([f"# {heading}", ""] + body),
+        source_application="jimmy",
+        resources=resources,
+        created=date,
+        updated=date,
+        author=message["From"],
+    )
+    return note_imf
diff --git a/test/data b/test/data
index c88dfcfb..2e7344f1 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit c88dfcfbbbc84b07664edab0e9ac0e6ab4732706
+Subproject commit 2e7344f1c0b09938e7911f28e194b0d42048c8eb
diff --git a/test/test_convert.py b/test/test_convert.py
index 1c9e1ee6..2a81a070 100644
--- a/test/test_convert.py
+++ b/test/test_convert.py
@@ -169,6 +169,7 @@ def test_formats(self, test_input):
             ["single_file", ["default_format/arbitrary_folder/plaintext.txt"]],
             ["multiple_files", ["default_format/arbitrary_folder/plaintext.txt"] * 2],
             ["markdown_file", ["default_format/arbitrary_folder/sample.md"]],
+            ["eml", ["default_format/eml"]],
         ]
     )
     def test_default_format(self, test_name, test_input):