Skip to content

Commit

Permalink
made load file contents more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
john-friedman committed Oct 29, 2024
1 parent 03ad865 commit b017d58
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 23 deletions.
64 changes: 55 additions & 9 deletions datamule/build/lib/datamule/parser/helper.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,66 @@
from selectolax.parser import HTMLParser
from pathlib import Path


def load_file_content(filename):
# ~ 30ms per file
def load_text_content(filename, encoding=None):
    """Read a text file and normalize common typographic characters.

    Non-breaking spaces and em spaces become plain spaces; curly single and
    double quotes become their straight ASCII equivalents.

    Args:
        filename: Path of the file to read (anything ``open`` accepts).
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            lets ``open`` use the platform default encoding.

    Returns:
        The file contents as a ``str`` with the characters above replaced.
    """
    replacements = str.maketrans({
        '\xa0': ' ', '\u2003': ' ',
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"',
    })
    with open(filename, encoding=encoding) as f:
        return f.read().translate(replacements)

def load_html_content(filename, encoding=None):
    """Extract the visible text of an HTML file as newline-separated lines.

    Hidden elements are removed, text belonging to ``script``/``style``
    elements is skipped, inline text fragments are joined with spaces, and
    block-level elements start a new line. Runs of three or more newlines
    are collapsed to two, and common typographic characters (non-breaking
    and em spaces, curly quotes) are replaced by ASCII equivalents.

    Args:
        filename: Path of the HTML file to read (anything ``open`` accepts).
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            lets ``open`` use the platform default encoding.

    Returns:
        The extracted text as a ``str``.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(filename, encoding=encoding) as f:
        parser = HTMLParser(f.read())

    # Remove hidden elements first
    hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
    for node in hidden_nodes:
        node.decompose()

    blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
    skipped = ('script', 'style', 'css')
    lines = []
    current_line = []

    def flush_line():
        # Emit the inline fragments gathered so far as one space-joined line.
        if current_line:
            lines.append(' '.join(current_line))
            current_line.clear()

    for node in parser.root.traverse(include_text=True):
        if node.tag in skipped:
            continue

        # Skipping the element itself does not stop the traversal from
        # visiting its text children, so script/CSS source would otherwise
        # end up in the output; drop text whose direct parent is skipped.
        parent = node.parent
        if parent is not None and parent.tag in skipped:
            continue

        if node.tag in blocks:
            # A block element ends the current inline run and adds a gap.
            flush_line()
            lines.append('')

        if node.text_content:
            text = node.text_content.strip()
            if text:
                # NOTE(review): text_content appears to be populated only on
                # text nodes - confirm whether the block branch is reachable.
                if node.tag in blocks:
                    flush_line()
                    lines.append(text)
                    lines.append('')
                else:
                    current_line.append(text)

    flush_line()

    text = '\n'.join(lines)
    # Collapse any run of blank lines down to a single blank line.
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')

    return text.translate(str.maketrans({
        '\xa0': ' ', '\u2003': ' ',
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"'
    }))
def load_file_content(filename):
    """Load a file's text, choosing the loader from the file extension.

    ``.txt`` files go to ``load_text_content``; ``.html`` and ``.htm`` files
    go to ``load_html_content``. The extension check is case-insensitive and
    accepts both ``str`` and ``pathlib.Path`` style arguments.

    Args:
        filename: Path of the file to load.

    Returns:
        Whatever the selected loader returns for ``filename``.

    Raises:
        ValueError: If the extension is not ``.txt``, ``.html`` or ``.htm``.
    """
    # str() lets path objects through; lower() makes '.HTM' match '.htm'.
    name = str(filename).lower()
    if name.endswith('.txt'):
        return load_text_content(filename)
    if name.endswith(('.html', '.htm')):
        return load_html_content(filename)
    raise ValueError(f"Unsupported file type: {filename}")

def clean_title(title: str) -> str:
"""Clean up section title by removing newlines, periods, and all whitespace, converting to lowercase."""
Expand Down
2 changes: 1 addition & 1 deletion datamule/datamule.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: datamule
Version: 0.369
Version: 0.371
Summary: Making it easier to use SEC filings.
Home-page: https://github.com/john-friedman/datamule-python
Author: John Friedman
Expand Down
6 changes: 3 additions & 3 deletions datamule/datamule.egg-info/requires.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ setuptools
selectolax

[all]
flask
google-generativeai
psutil
lxml
psutil
flask
openai
google-generativeai
pandas

[dataset_builder]
Expand Down
64 changes: 55 additions & 9 deletions datamule/datamule/parser/helper.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,66 @@
from selectolax.parser import HTMLParser
from pathlib import Path


def load_file_content(filename):
# ~ 30ms per file
def load_text_content(filename, encoding=None):
    """Read a text file and normalize common typographic characters.

    Non-breaking spaces and em spaces become plain spaces; curly single and
    double quotes become their straight ASCII equivalents.

    Args:
        filename: Path of the file to read (anything ``open`` accepts).
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            lets ``open`` use the platform default encoding.

    Returns:
        The file contents as a ``str`` with the characters above replaced.
    """
    replacements = str.maketrans({
        '\xa0': ' ', '\u2003': ' ',
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"',
    })
    with open(filename, encoding=encoding) as f:
        return f.read().translate(replacements)

def load_html_content(filename, encoding=None):
    """Extract the visible text of an HTML file as newline-separated lines.

    Hidden elements are removed, text belonging to ``script``/``style``
    elements is skipped, inline text fragments are joined with spaces, and
    block-level elements start a new line. Runs of three or more newlines
    are collapsed to two, and common typographic characters (non-breaking
    and em spaces, curly quotes) are replaced by ASCII equivalents.

    Args:
        filename: Path of the HTML file to read (anything ``open`` accepts).
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            lets ``open`` use the platform default encoding.

    Returns:
        The extracted text as a ``str``.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(filename, encoding=encoding) as f:
        parser = HTMLParser(f.read())

    # Remove hidden elements first
    hidden_nodes = parser.css('[style*="display: none"], [style*="display:none"], .hidden, .hide, .d-none')
    for node in hidden_nodes:
        node.decompose()

    blocks = {'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'article', 'section', 'li', 'td'}
    skipped = ('script', 'style', 'css')
    lines = []
    current_line = []

    def flush_line():
        # Emit the inline fragments gathered so far as one space-joined line.
        if current_line:
            lines.append(' '.join(current_line))
            current_line.clear()

    for node in parser.root.traverse(include_text=True):
        if node.tag in skipped:
            continue

        # Skipping the element itself does not stop the traversal from
        # visiting its text children, so script/CSS source would otherwise
        # end up in the output; drop text whose direct parent is skipped.
        parent = node.parent
        if parent is not None and parent.tag in skipped:
            continue

        if node.tag in blocks:
            # A block element ends the current inline run and adds a gap.
            flush_line()
            lines.append('')

        if node.text_content:
            text = node.text_content.strip()
            if text:
                # NOTE(review): text_content appears to be populated only on
                # text nodes - confirm whether the block branch is reachable.
                if node.tag in blocks:
                    flush_line()
                    lines.append(text)
                    lines.append('')
                else:
                    current_line.append(text)

    flush_line()

    text = '\n'.join(lines)
    # Collapse any run of blank lines down to a single blank line.
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')

    return text.translate(str.maketrans({
        '\xa0': ' ', '\u2003': ' ',
        '\u2018': "'", '\u2019': "'",
        '\u201c': '"', '\u201d': '"'
    }))
def load_file_content(filename):
    """Load a file's text, choosing the loader from the file extension.

    ``.txt`` files go to ``load_text_content``; ``.html`` and ``.htm`` files
    go to ``load_html_content``. The extension check is case-insensitive and
    accepts both ``str`` and ``pathlib.Path`` style arguments.

    Args:
        filename: Path of the file to load.

    Returns:
        Whatever the selected loader returns for ``filename``.

    Raises:
        ValueError: If the extension is not ``.txt``, ``.html`` or ``.htm``.
    """
    # str() lets path objects through; lower() makes '.HTM' match '.htm'.
    name = str(filename).lower()
    if name.endswith('.txt'):
        return load_text_content(filename)
    if name.endswith(('.html', '.htm')):
        return load_html_content(filename)
    raise ValueError(f"Unsupported file type: {filename}")

def clean_title(title: str) -> str:
"""Clean up section title by removing newlines, periods, and all whitespace, converting to lowercase."""
Expand Down
Binary file removed datamule/dist/datamule-0.369.tar.gz
Binary file not shown.
Binary file not shown.
Binary file added datamule/dist/datamule-0.371.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion datamule/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
setup(
name="datamule",
author="John Friedman",
version="0.369",
version="0.371",
description="Making it easier to use SEC filings.",
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down

0 comments on commit b017d58

Please sign in to comment.