forked from timbertson/python-readability
-
Notifications
You must be signed in to change notification settings - Fork 352
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Bumped to version 0.6
- Loading branch information
Showing
9 changed files
with
223 additions
and
173 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
def open_in_browser(html):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it.  Note that this does not delete the file after
    use.  This is mainly meant for debugging.

    ``html`` is a text string; it is written UTF-8 encoded, preceded by
    a ``<meta charset>`` tag so the browser decodes it correctly.

    Returns the ``file://`` URL that was handed to the browser.
    """
    import os
    import webbrowser
    import tempfile

    handle, fn = tempfile.mkstemp(suffix='.html')
    # We leak the file itself here (by design, so the browser can read
    # it), but the ``with`` block guarantees the descriptor is closed.
    with os.fdopen(handle, 'wb') as f:
        f.write(b"<meta charset='UTF-8' />")
        f.write(html.encode('utf-8'))

    path = fn.replace(os.path.sep, '/')
    # An absolute path that does not start with '/' (e.g. 'C:/Temp/x.html')
    # needs one so the drive does not end up in the URL's host position:
    # 'file:///C:/Temp/x.html' rather than 'file://C:/Temp/x.html'.
    if not path.startswith('/'):
        path = '/' + path
    url = 'file://' + path
    webbrowser.open(url)
    return url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,53 @@ | ||
def save_to_file(text, filename):
    """
    Write ``text`` to ``filename`` as a UTF-8 encoded HTML document.

    A ``<meta http-equiv>`` tag declaring the UTF-8 charset is written
    first, followed by the encoded text.  An existing file is overwritten.
    """
    # The payload is encoded bytes, so the file has to be opened in binary
    # mode; a text-mode handle rejects bytes on Python 3.  The ``with``
    # block closes the file even if a write fails.
    with open(filename, 'wb') as f:
        f.write(b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
        f.write(text.encode('utf-8'))
import re | ||
|
||
# FIXME: use with caution, can leak memory
# Maps a node to the sequence number that describe_node() assigned to it.
uids = {}
# Root element of the document that the entries in ``uids`` belong to.
uids_document = None


def describe_node(node):
    """
    Return a short CSS-selector-like label for ``node``, for debug output.

    - ``None`` gives ``''``.
    - An object without a ``tag`` attribute gives ``"[<its type>]"``.
    - Otherwise the label is the tag, followed by ``#id`` and
      ``.class1.class2`` when those attributes are set.  A leading ``div``
      is dropped when an id or class follows it.
    - A bare ``tr``, ``td``, ``div`` or ``p`` gets a ``{NN}`` suffix with a
      per-node sequence number taken from the module-level ``uids`` map, so
      otherwise identical labels can be told apart.
    """
    if node is None:
        return ''
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        uid = uids.get(node)
        if uid is None:
            # First time this node is seen: hand out the next number.
            uid = uids[node] = len(uids) + 1
        name += "{%02d}" % uid
    return name
|
||
|
||
def describe(node, depth=2):
    """
    Return a '/'-separated path of labels for ``node`` and its ancestors.

    Up to ``depth`` parent levels are included, outermost first; each
    level is rendered by ``describe_node`` and preceded by a '/'.

    The module-level ``uids`` numbering is reset whenever ``node`` belongs
    to a different document root than the one seen on the previous call.
    """
    global uids, uids_document
    root = node.getroottree().getroot()
    if root != uids_document:
        # New document: start numbering from scratch.
        uids_document = root
        uids = {}

    if not depth or node.getparent() is None:
        prefix = ''
    else:
        prefix = describe(node.getparent(), depth=depth - 1)
    return '/'.join((prefix, describe_node(node)))
|
||
|
||
# Matches any run of whitespace.  Raw string: '\s' in a plain literal is an
# invalid escape sequence.
RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)


def text_content(elem, length=40):
    """
    Return the text of ``elem`` as a short single-line preview.

    Carriage returns are removed and every run of whitespace is collapsed
    to one space.  Text of at most ``length`` characters is returned
    unchanged; longer text is cut to ``length`` characters and suffixed
    with ``'...'``.

    ``elem`` must provide a ``text_content()`` method returning a string.
    """
    content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
    # ``<=`` so that text of exactly ``length`` characters, which loses
    # nothing, is not marked as truncated.
    if len(content) <= length:
        return content
    return content[:length] + '...'
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.