Skip to content

Commit

Permalink
Strip empty prop and content tags
Browse files (browse the repository at this point in the history)
Updated the tests
  • Loading branch information
Cristi Constantin committed Jul 17, 2019
1 parent 8e69411 commit d4645ef
Show file tree
Hide file tree
Showing 5 changed files with 14 additions and 6 deletions.
4 changes: 2 additions & 2 deletions extruct/opengraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def extract_items(self, document, base_url=None):
namespaces.update(self.get_namespaces(head))
props = []
for el in head.xpath('meta[@property and @content]'):
prop = el.attrib['property']
val = el.attrib['content']
prop = el.attrib['property'].strip()
val = el.attrib['content'].strip()
if prop == '' or val == '':
continue
ns = prop.partition(':')[0]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ requests
rdflib
rdflib-jsonld
mf2py>=1.1.0
six
six>=1.11
w3lib
1 change: 1 addition & 0 deletions tests/samples/songkick/elysianfields.html
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
<meta property="og:site_name" content="Songkick">
<meta property="og:type" content="songkick-concerts:artist">
<meta property="og:title" content="Elysian Fields">
<meta property="og:title" content=" ">
<meta property="og:description" content="Buy tickets for an upcoming Elysian Fields concert near you. List of all Elysian Fields tickets and tour dates for 2017.">
<meta property="og:description" content="" />
<meta property="og:url" content="http://www.songkick.com/artists/236156-elysian-fields">
Expand Down
3 changes: 3 additions & 0 deletions tests/samples/songkick/elysianfields.json
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@
"http://ogp.me/ns#title": [
{
"@value": "Elysian Fields"
},
{
"@value": " "
}
],
"http://ogp.me/ns#type": [
Expand Down
10 changes: 7 additions & 3 deletions tests/test_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@ def test_all(self):
body = get_testdata('songkick', 'elysianfields.html')
expected = json.loads(get_testdata('songkick', 'elysianfields.json').decode('UTF-8'))
data = extruct.extract(body, base_url='http://www.songkick.com/artists/236156-elysian-fields')
# See test_rdfa_not_preserving_order()
del data['rdfa'][0]['http://ogp.me/ns#image']
del expected['rdfa'][0]['http://ogp.me/ns#image']
# Sort the values here because RDFa does not preserve the order of duplicated properties.
# See https://github.com/scrapinghub/extruct/issues/116
# Also see test_rdfa_not_preserving_order()
for rdf in data['rdfa']:
for key, pairs in rdf.items():
if ':' in key and isinstance(pairs, list):
rdf[key] = sorted(pairs, key=lambda e: e["@value"], reverse=True)
self.assertEqual(jsonize_dict(data), expected)

@pytest.mark.xfail
Expand Down

0 comments on commit d4645ef

Please sign in to comment.