diff --git a/README.rst b/README.rst index 1c71c690..c96c9f98 100644 --- a/README.rst +++ b/README.rst @@ -393,6 +393,8 @@ JSON-LD extraction 'jobTitle': 'Graduate research assistant', 'name': 'John Doe', 'url': 'http://www.example.com'}] + >>> # raw json output is also possible: + >>> raw_json = jslde.extract(html, parse_json=False) RDFa extraction (experimental) @@ -437,6 +439,8 @@ RDFa extraction (experimental) 'photos than I do:\n' ' '}], 'http://schema.org/creator': [{'@id': 'http://www.example.com/index.html#me'}]}] + >>> # raw json output is also possible: + >>> raw_json = rdfae.extract(html, parse_json=False) You'll get a list of expanded JSON-LD nodes. diff --git a/extruct/jsonld.py b/extruct/jsonld.py index f11580eb..62268fa2 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -16,26 +16,30 @@ class JsonLdExtractor(object): _xp_jsonld = lxml.etree.XPath('descendant-or-self::script[@type="application/ld+json"]') - def extract(self, htmlstring, base_url=None, encoding="UTF-8"): + def extract(self, htmlstring, base_url=None, encoding="UTF-8", parse_json=True): tree = parse_html(htmlstring, encoding=encoding) - return self.extract_items(tree, base_url=base_url) + return self.extract_items(tree, base_url=base_url, parse_json=parse_json) - def extract_items(self, document, base_url=None): + def extract_items(self, document, base_url=None, parse_json=True): + if not parse_json: + return [self._extract_items_raw(item) for item in self._xp_jsonld(document)] return [ item for items in map(self._extract_items, self._xp_jsonld(document)) if items for item in items if item ] + def _extract_items_raw(self, node): + return node.xpath('string()') + def _extract_items(self, node): - script = node.xpath('string()') + script = self._extract_items_raw(node) try: # TODO: `strict=False` can be configurable if needed data = json.loads(script, strict=False) except ValueError: # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = json.loads( - 
HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) + data = json.loads(HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) if isinstance(data, list): return data elif isinstance(data, dict): diff --git a/extruct/rdfa.py b/extruct/rdfa.py index e5ab06bd..81c7bc41 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -30,11 +30,11 @@ class RDFaExtractor(object): def extract(self, htmlstring, base_url=None, encoding="UTF-8", - expanded=True): + expanded=True, parse_json=True): tree = parse_xmldom_html(htmlstring, encoding=encoding) - return self.extract_items(tree, base_url=base_url, expanded=expanded) + return self.extract_items(tree, base_url=base_url, expanded=expanded, parse_json=parse_json) - def extract_items(self, document, base_url=None, expanded=True): + def extract_items(self, document, base_url=None, expanded=True, parse_json=True): options = Options(output_processor_graph=True, embedded_rdf=False, space_preserve=True, @@ -46,4 +46,6 @@ def extract_items(self, document, base_url=None, expanded=True): g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph()) jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8') - return json.loads(jsonld_string) + if parse_json: + return json.loads(jsonld_string) + return jsonld_string diff --git a/tests/samples/misc/Portfolio_Niels_Lubberman.json b/tests/samples/misc/Portfolio_Niels_Lubberman.json index fb4364fd..490268e0 100644 --- a/tests/samples/misc/Portfolio_Niels_Lubberman.json +++ b/tests/samples/misc/Portfolio_Niels_Lubberman.json @@ -1,42 +1,24 @@ [ { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_jobs.png?itok=QUE2ZKFT&sc=259cbe26e2b9c2489443d05fdcd3f824" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_download.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": 
["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_kanaal.jpg?itok=cV8u1cxa&sc=b496d2d94d76a056d4e6efde1cfb2abe" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_asppolgs_final.png?itok=apZpSYdS&sc=7ade4b4c9baeea7a86bad48589f9649d" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_tim_berners_lee.png?itok=DghJBlqt&sc=259cbe26e2b9c2489443d05fdcd3f824" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_roemenie.png?itok=pScuIyeN&sc=363eea0a2ddd62c554241fc1fed1f3bc" - }, - { - "http://www.w3.org/1999/xhtml/vocab#icon": [ - { - "@id": "http://nielslubberman.nl/drupal/misc/favicon.ico" - } - ], + "@id": "http://nielslubberman.nl/drupal/", "http://purl.org/rss/1.0/modules/content/encoded": [ { - "@value": "
Op deze vernieuwde website kunt u enkele van mijn projecten vinden, tevens kunt u lessen downloaden die ik heb gemaakt.
\n\n", - "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" + "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", + "@value": "Voeg mij nu toe aan uw professionele netwerk op LinkedIn.
\n\n" }, { - "@value": "Voeg mij nu toe aan uw professionele netwerk op LinkedIn.
\n\n", - "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" + "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", + "@value": "Op deze vernieuwde website kunt u enkele van mijn projecten vinden, tevens kunt u lessen downloaden die ik heb gemaakt.
\n\n" }, { - "@value": "Met behulp van de pijl hieronder kunt u mijn CV downloaden.
\n\n", - "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral" + "@type": "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral", + "@value": "Met behulp van de pijl hieronder kunt u mijn CV downloaden.
\n\n" } ], "http://www.w3.org/1999/xhtml/vocab#alternate": [ @@ -44,63 +26,112 @@ "@id": "http://nielslubberman.nl/drupal/?q=rss.xml" } ], - "@id": "http://nielslubberman.nl/drupal/" + "http://www.w3.org/1999/xhtml/vocab#icon": [ + { + "@id": "http://nielslubberman.nl/drupal/misc/favicon.ico" + } + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_linkedin.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_turing.png?itok=anlTc5N6&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_download.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_jobs.png?itok=QUE2ZKFT&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_turing.png?itok=anlTc5N6&sc=259cbe26e2b9c2489443d05fdcd3f824" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_2.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_ru_secure.png?itok=2-xDWirb&sc=67c0f518676aaf034a9215a0ec7e9e1e" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_roemenie.png?itok=pScuIyeN&sc=363eea0a2ddd62c554241fc1fed1f3bc", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": 
"http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/1979%20Nicolae%20si%20Nicu%20Ceausescu%20la%20Canal.JPG?itok=CYcBbx1w&sc=3e5afd5e3e8746f6db8fb6f52a325372" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_kanaal.jpg?itok=cV8u1cxa&sc=b496d2d94d76a056d4e6efde1cfb2abe", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_1.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_gates.png?itok=sIwGJEG3&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_gates.png?itok=sIwGJEG3&sc=259cbe26e2b9c2489443d05fdcd3f824" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/1979%20Nicolae%20si%20Nicu%20Ceausescu%20la%20Canal.JPG?itok=CYcBbx1w&sc=3e5afd5e3e8746f6db8fb6f52a325372", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_3.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_poster_tim_berners_lee.png?itok=DghJBlqt&sc=259cbe26e2b9c2489443d05fdcd3f824", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_2.png" + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/headlines/headline_linkedin.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] }, { - 
"http://www.w3.org/2004/02/skos/core#prefLabel": [ + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_1.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_4.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_3.png", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_ru_secure.png?itok=2-xDWirb&sc=67c0f518676aaf034a9215a0ec7e9e1e", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/sites/default/files/styles/smallcrop/public/projects/pictures/pro_asppolgs_final.png?itok=apZpSYdS&sc=7ade4b4c9baeea7a86bad48589f9649d", + "@type": [ + "http://xmlns.com/foaf/0.1/Image" + ] + }, + { + "@id": "http://nielslubberman.nl/drupal/?q=taxonomy/term/1", + "@type": [ + "http://www.w3.org/2004/02/skos/core#Concept" + ], + "http://www.w3.org/2000/01/rdf-schema#label": [ { "@language": "en", "@value": "Geschiedenis" } ], - "http://www.w3.org/2000/01/rdf-schema#label": [ + "http://www.w3.org/2004/02/skos/core#prefLabel": [ { "@language": "en", "@value": "Geschiedenis" } - ], - "@type": ["http://www.w3.org/2004/02/skos/core#Concept"], - "@id": "http://nielslubberman.nl/drupal/?q=taxonomy/term/1" - }, - { - "@type": ["http://xmlns.com/foaf/0.1/Image"], - "@id": "http://nielslubberman.nl/drupal/sites/default/files/images/projects/grey_img_4.png" + ] } ] - diff --git a/tests/samples/songkick/Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld b/tests/samples/songkick/Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld new file mode 100644 index 00000000..bb0ef0dd --- /dev/null +++ b/tests/samples/songkick/Elysian Fields Brooklyn 
Tickets, The Owl Music Parlor, 31 Oct 2015.raw.jsonld @@ -0,0 +1 @@ +[{"@context":"http://schema.org","@type":"MusicEvent","name":"Elysian Fields","url":"http://www.songkick.com/concerts/25248299-elysian-fields-at-owl-music-parlor?utm_medium=organic\u0026utm_source=microformat","location":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Brooklyn","addressCountry":"US","addressRegion":"NY","streetAddress":"497 Rogers Ave","postalCode":"11225"},"name":"The Owl Music Parlor","sameAs":"http://www.theowl.nyc","geo":{"@type":"GeoCoordinates","latitude":40.660109,"longitude":-73.953193}},"startDate":"2015-10-31T19:30:00-0400","performer":[{"@type":"MusicGroup","name":"Elysian Fields","sameAs":"http://www.songkick.com/artists/236156-elysian-fields?utm_medium=organic\u0026utm_source=microformat"}]}] \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 47309ee9..63699908 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -41,6 +41,13 @@ def test_jsonld_with_control_characters_comment(self): folder='custom.invalid', page='JSONLD_with_control_characters_comment') + def test_jsonld_raw_json(self): + folder = 'songkick' + page = 'Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015' + body = get_testdata(folder, '{}.html'.format(page)) + expected = get_testdata(folder, '{}.raw.jsonld'.format(page)).decode('utf8') + self._check_jsonld(body, [expected], parse_json=False) + def assertJsonLdCorrect(self, folder, page): body, expected = self._get_body_expected(folder, page) self._check_jsonld(body, expected) @@ -50,7 +57,7 @@ def _get_body_expected(self, folder, page): expected = get_testdata(folder, '{}.jsonld'.format(page)) return body, json.loads(expected.decode('utf8')) - def _check_jsonld(self, body, expected): + def _check_jsonld(self, body, expected, **extract_kwargs): jsonlde = JsonLdExtractor() - data = jsonlde.extract(body) + data = jsonlde.extract(body, **extract_kwargs) 
self.assertEqual(data, expected) diff --git a/tests/test_rdfa.py b/tests/test_rdfa.py index 98da33d2..59f25232 100644 --- a/tests/test_rdfa.py +++ b/tests/test_rdfa.py @@ -90,6 +90,19 @@ def test_wikipedia_xhtml_rdfa(self): self.assertJsonLDEqual(data, expected) + def test_wikipedia_xhtml_rdfa_raw(self): + """ + test whether raw json is extracted properly + using parse_json=False keyword argument for the extraction method + """ + fileprefix = 'xhtml+rdfa' + body = get_testdata('wikipedia', fileprefix + '.html') + expected = get_testdata('wikipedia', fileprefix + '.expanded.json').decode('UTF-8').strip() + data = RDFaExtractor().extract( + body, base_url='http://www.example.com/index.html', parse_json=False + ).strip() + self.assertEquals(self.normalize_bnode_ids(data), self.normalize_bnode_ids(expected)) + def test_wikipedia_xhtml_rdfa_no_prefix(self): body = get_testdata('misc', 'Portfolio_Niels_Lubberman.html') expected = json.loads( @@ -98,5 +111,4 @@ def test_wikipedia_xhtml_rdfa_no_prefix(self): rdfae = RDFaExtractor() data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/') - self.assertJsonLDEqual(data, expected)