Skip to content

Commit

Permalink
add support for eml
Browse files Browse the repository at this point in the history
  • Loading branch information
marph91 committed Oct 12, 2024
1 parent efb1e93 commit 06c19b4
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/formats/default.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ The default import covers the following formats:
- [Rich Text Format (RTF)](https://en.wikipedia.org/wiki/Rich_Text_Format)
- [txt2tags](https://txt2tags.org/)
- [asciidoc](https://docs.asciidoctor.org/asciidoc/latest/) (requires [asciidoctor](https://asciidoctor.org/) installed and in path)
- [Email (eml)](https://en.wikipedia.org/wiki/Email#Filename_extensions)
- [Fountain](https://fountain.io/):
- There is a [built-in Joplin plugin](https://joplinapp.org/help/apps/markdown/#markdown-plugins) that can be activated in the settings.
- There is an [Obsidian plugin](https://github.com/Darakah/obsidian-fountain).
Expand Down
5 changes: 5 additions & 0 deletions src/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import common
import intermediate_format as imf
import markdown_lib.common
import markdown_lib.eml


class BaseConverter(abc.ABC):
Expand Down Expand Up @@ -136,6 +137,10 @@ def convert_file(self, file_: Path, parent: imf.Notebook):
if note_body_splitted[-2].startswith("Last updated "):
# Remove unnecessarily added lines if needed.
note_body = "\n".join(note_body_splitted[:-2])
case ".eml":
note_imf = markdown_lib.eml.eml_to_note(file_, self.resource_folder)
parent.child_notes.append(note_imf)
return # don't use the common conversion
case ".fountain":
# Simply wrap in a code block. This is supported in
# Joplin and Obsidian via plugins.
Expand Down
108 changes: 108 additions & 0 deletions src/markdown_lib/eml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Convert an Email (.eml) to the intermediate format."""

import datetime as dt
import email
import email.policy
import email.utils
import logging
from pathlib import Path
import time

import common
import intermediate_format as imf
import markdown_lib.common


LOGGER = logging.getLogger("jimmy")


def decode_payload(part) -> str:
    """Return the content of a message part, tolerating broken encodings.

    ``part.get_content()`` is tried first. If it raises ``LookupError`` or
    ``UnicodeDecodeError`` (for example because the declared charset is
    unknown), the transfer-decoded payload bytes are interpreted as UTF-8
    instead. Bytes that are not valid UTF-8 are replaced by U+FFFD rather
    than raising, so one badly encoded part cannot abort the conversion.

    Args:
        part: The message part to decode.

    Returns:
        The content as returned by ``get_content()``, or the UTF-8 decoded
        payload when the fallback is used.
    """
    try:
        return part.get_content()
    except (LookupError, UnicodeDecodeError):
        # try to work around invalid encodings by trying with "utf-8";
        # "replace" keeps the fallback itself from raising on stray bytes
        return part.get_payload(decode=True).decode("utf-8", errors="replace")


def handle_part(part, attachment_folder: Path) -> tuple[list[str], list[imf.Resource]]:
    """Turn a single message part into body text or an attachment resource.

    HTML parts are converted to Markdown, Markdown and plain text parts are
    taken over as they are. Any other audio, image, application or text part
    is written to a uniquely named file inside ``attachment_folder`` and
    returned as a resource. All remaining mime types are logged and ignored.

    Args:
        part: The message part to handle.
        attachment_folder: Folder that receives the payload of attachments.

    Returns:
        A tuple ``(body, resources)``. At most one of the two lists contains
        an element.
    """
    mime = part.get_content_type()

    if mime == "text/html":
        converted = markdown_lib.common.markup_to_markdown(decode_payload(part))
        return [converted], []

    if mime in ("text/markdown", "text/plain"):
        return [decode_payload(part)], []

    if not mime.startswith(("audio/", "image/", "application/", "text/")):
        LOGGER.debug(f"Unhandled mime type: {mime}")
        return [], []

    # Everything that is left is stored as an attachment.
    id_ = part.get("Content-ID")
    if id_ is not None:
        # id seems to be enclosed by <> here, but by [] in the body
        id_ = f"[cid:{id_[1:-1]}]"
    target_path = attachment_folder / common.unique_title()
    target_path.write_bytes(part.get_payload(decode=True))
    attachment = imf.Resource(
        target_path, original_text=id_, title=part.get_filename()
    )
    return [], [attachment]


def parse_message(
    message, attachment_folder: Path
) -> tuple[list[str], list[imf.Resource]]:
    """Collect the body text and attachments of a (possibly nested) message.

    A non-multipart message is handled directly. For "multipart/alternative"
    only the preferred representation is used (plain text before HTML). Every
    other multipart container is walked recursively, part by part.

    Args:
        message: The message or message part to process.
        attachment_folder: Folder that receives the payload of attachments.

    Returns:
        A tuple ``(body, resources)`` with the collected text fragments and
        attachment resources, in the order they were encountered.
    """
    texts = []
    attachments = []

    if not message.is_multipart():
        leaf_texts, leaf_attachments = handle_part(message, attachment_folder)
        texts += leaf_texts
        attachments += leaf_attachments
        return texts, attachments

    container_type = message.get_content_type()
    subparts = message.get_payload()

    if container_type != "multipart/alternative":
        # iterate over all available payloads
        for subpart in subparts:
            sub_texts, sub_attachments = parse_message(subpart, attachment_folder)
            texts += sub_texts
            attachments += sub_attachments
        return texts, attachments

    # choose the best payload: text is easy to process
    preferred = message.get_body(preferencelist=("plain", "html"))
    if preferred is None:
        LOGGER.debug("failed to obtain body")
        return texts, attachments

    best_texts, best_attachments = handle_part(preferred, attachment_folder)
    texts += best_texts
    attachments += best_attachments
    return texts, attachments


def eml_to_note(file_: Path, attachment_folder: Path) -> imf.Note:
    """Convert a single Email (.eml) file to a note of the intermediate format.

    The note body starts with a level-one heading containing the subject,
    followed by the text parts of the message. If the message has no
    "Subject" header, the file stem is used for the heading instead. The
    timestamp is taken from the "Date" header, falling back to the date at
    the end of the "Received" header. The timezone offset of the header is
    honored; the resulting datetime is naive and expressed in local time.

    Args:
        file_: Path of the .eml file to convert.
        attachment_folder: Folder that receives the payload of attachments.

    Returns:
        The note, including the resources found in the message.
    """
    # decode the header by using the default policy
    # https://stackoverflow.com/a/55210089/7410886
    message = email.message_from_bytes(file_.read_bytes(), policy=email.policy.default)

    date_candidates = []
    if message["Date"] is not None:
        date_candidates.append(str(message["Date"]))
    if message["Received"] is not None:
        # the timestamp is the last ";"-separated field of this header
        date_candidates.append(str(message["Received"]).rpartition(";")[2])

    # time tuple with UTC offset -> unix timestamp -> datetime
    date = None
    for candidate in date_candidates:
        parsed_date = email.utils.parsedate_tz(candidate)
        if parsed_date is None:
            continue
        try:
            # mktime_tz() applies the offset from the header. A plain
            # time.mktime() would interpret the sender's wall clock time
            # as local time and shift the timestamp.
            date = dt.datetime.fromtimestamp(int(email.utils.mktime_tz(parsed_date)))
        except (OverflowError, OSError, ValueError):
            # timestamp is out of the range supported by the platform
            continue
        break
    if date is None:
        LOGGER.debug("failed to obtain date")

    body, resources = parse_message(message, attachment_folder)

    # Use a local variable instead of nesting double quotes inside the
    # f-string. The nested form is only valid syntax since Python 3.12.
    subject = message["Subject"]
    if subject is None:
        subject = file_.stem  # avoid a literal "# None" heading

    note_imf = imf.Note(
        # TODO: f"{0 if date is None else date.isoformat()}_{subject}",
        file_.stem,
        "\n".join([f"# {subject}", ""] + body),
        source_application="jimmy",
        resources=resources,
        created=date,
        updated=date,
        author=message["From"],
    )
    return note_imf
2 changes: 1 addition & 1 deletion test/data
Submodule data updated 231 files
1 change: 1 addition & 0 deletions test/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ def test_formats(self, test_input):
["single_file", ["default_format/arbitrary_folder/plaintext.txt"]],
["multiple_files", ["default_format/arbitrary_folder/plaintext.txt"] * 2],
["markdown_file", ["default_format/arbitrary_folder/sample.md"]],
["eml", ["default_format/eml"]],
]
)
def test_default_format(self, test_name, test_input):
Expand Down

0 comments on commit 06c19b4

Please sign in to comment.