Skip to content

Commit

Permalink
Merge pull request #2 from remarkablemark/client-parser-refactor
Browse files Browse the repository at this point in the history
Refactor and improve client parser
  • Loading branch information
remarkablemark authored Jun 17, 2017
2 parents 0787501 + c873b6e commit 23fcfcc
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 92 deletions.
161 changes: 161 additions & 0 deletions lib/domparser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
'use strict';

/**
 * Constants.
 *
 * Patterns and tag names used to recognize document-level markup
 * (`<html>`, `<head>`, `<body>`) in an HTML string.
 */

// Captures the name of the first opening tag in a string, e.g. "h1" from <h1>.
var FIRST_TAG_REGEX = /<([a-zA-Z]+[0-9]?)/;

// Match (case-insensitively) a string containing both an opening and a
// closing head / body tag.
var HEAD_REGEX = /<head[\s\S]*>[\s\S]*<\/head>/i,
    BODY_REGEX = /<body[\s\S]*>[\s\S]*<\/body>/i;

// Lower-case names of the document-level elements.
var HTML_TAG_NAME = 'html',
    HEAD_TAG_NAME = 'head',
    BODY_TAG_NAME = 'body';

/**
 * DOMParser strategy (performance: slow).
 *
 * `parseFromString` is only defined when `window.DOMParser` is a function;
 * otherwise it stays `undefined`.
 *
 * https://developer.mozilla.org/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document
 */
var parseFromString;
if (typeof window.DOMParser === 'function') {
  // A single parser instance is created up front and reused by every call.
  var domParser = new window.DOMParser();
  var MIME_TYPE = 'text/' + HTML_TAG_NAME;

  /**
   * Parses an HTML string with `DOMParser.parseFromString`.
   *
   * @param {String} html - The HTML string.
   * @param {String} [tagName] - When given, `html` is wrapped in an element
   *   of this name before it is parsed.
   * @return {HTMLDocument}
   */
  parseFromString = function domStringParser(html, tagName) {
    var markup = tagName
      ? ['<', tagName, '>', html, '</', tagName, '>'].join('')
      : html;

    return domParser.parseFromString(markup, MIME_TYPE);
  };
}

/**
 * DOMImplementation (performance: fair).
 *
 * `parseFromDocument` is only defined when
 * `document.implementation.createHTMLDocument` is available; otherwise it
 * stays `undefined`.
 *
 * https://developer.mozilla.org/docs/Web/API/DOMImplementation/createHTMLDocument
 */
var parseFromDocument;
// Detect the method itself rather than only `document.implementation`: the
// object can exist without `createHTMLDocument` (and `typeof null` is also
// 'object'), in which case the call below would throw while this module loads.
if (document.implementation && document.implementation.createHTMLDocument) {
  // A title argument is always passed because some older browsers (notably
  // Internet Explorer) reject a call without one.
  // The document is created once and reused by every call.
  var doc = document.implementation.createHTMLDocument('');

  /**
   * Use HTML document created by `document.implementation.createHTMLDocument`.
   *
   * The same document instance is returned on every call and its content is
   * overwritten each time, so results from an earlier call do not survive a
   * later one.
   *
   * @param {String} html - The HTML string.
   * @param {String} [tagName] - The element to render the HTML into. It must
   *   name an element that exists in the document (e.g. `body`); when omitted
   *   the HTML is rendered into the document element.
   * @return {HTMLDocument}
   */
  parseFromDocument = function createHTMLDocument(html, tagName) {
    if (tagName) {
      doc.documentElement.getElementsByTagName(tagName)[0].innerHTML = html;
    } else {
      doc.documentElement.innerHTML = html;
    }
    return doc;
  };
}

/**
 * Template strategy (performance: fast).
 *
 * `parseFromTemplate` is only defined when a `<template>` element exposes a
 * `content` fragment; otherwise it is `undefined`.
 *
 * https://developer.mozilla.org/docs/Web/HTML/Element/template
 */
var template = document.createElement('template');

/**
 * Parses HTML by assigning it to the shared template element and reading the
 * children of its content fragment.
 *
 * @param {String} html - The HTML string.
 * @return {NodeList}
 */
var parseFromTemplate = template.content
  ? function templateParser(html) {
      template.innerHTML = html;
      return template.content.childNodes;
    }
  : undefined;

/**
 * Fallback document parser.
 *
 * Prefers the DOMImplementation-based parser and uses the DOMParser-based one
 * otherwise; `undefined` when neither of them was defined.
 */
var parseWithFallback = parseFromDocument || parseFromString;

/**
* Parses HTML string to DOM nodes.
*
* @param {String} html - The HTML string.
* @param {String} [tagName] - The tag name.
* @return {NodeList|Array}
*/
module.exports = function domparser(html) {
// try to match first tag
var tagName;
var match = html.match(FIRST_TAG_REGEX);
if (match && match[1]) {
tagName = match[1];
}

var doc;
var element;
var elements;

switch (tagName) {
case HTML_TAG_NAME:
if (parseFromString) {
doc = parseFromString(html);

// strip elements if not found
if (!HEAD_REGEX.test(html)) {
element = doc.getElementsByTagName(HEAD_TAG_NAME)[0];
element.parentNode.removeChild(element);
}

if (!BODY_REGEX.test(html)) {
element = doc.getElementsByTagName(BODY_TAG_NAME)[0];
element.parentNode.removeChild(element);
}

return doc.getElementsByTagName(HTML_TAG_NAME);
}
break;

case HEAD_TAG_NAME:
if (parseWithFallback) {
elements = parseWithFallback(html).getElementsByTagName(HEAD_TAG_NAME);

// account for possibility of sibling
if (BODY_REGEX.test(html)) {
return elements[0].parentNode.childNodes;
}
return elements;
}
break;

case BODY_TAG_NAME:
if (parseWithFallback) {
elements = parseWithFallback(html).getElementsByTagName(BODY_TAG_NAME);

// account for possibility of sibling
if (HEAD_REGEX.test(html)) {
return elements[0].parentNode.childNodes;
}
return elements;
}
break;

// low-level tag or text
default:
if (parseFromTemplate) return parseFromTemplate(html);
if (parseWithFallback) {
return parseWithFallback(html, BODY_TAG_NAME).getElementsByTagName(BODY_TAG_NAME)[0].childNodes;
}
break;
}

return [];
};
101 changes: 18 additions & 83 deletions lib/html-to-dom-client.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,98 +3,33 @@
/**
* Module dependencies.
*/
var domparser = require('./domparser');
var utilities = require('./utilities');
var formatDOM = utilities.formatDOM;

/**
* Parse HTML string to DOM nodes.
* This uses the browser DOM API.
* Constants.
*/
var DIRECTIVE_REGEX = /<(![a-zA-Z\s]+)>/; // e.g., <!doctype html>

/**
* Parses HTML and reformats DOM nodes output.
*
* @param {String} html - The HTML.
* @return {Object} - The DOM nodes.
* @param {String} html - The HTML string.
* @return {Array} - The formatted DOM nodes.
*/
function parseDOM(html) {
module.exports = function parseDOM(html) {
if (typeof html !== 'string') {
throw new TypeError('First argument must be a string.');
}
if (!html) return [];

// try to match the tags
var match = html.match(/<[^\/](.+?)>/g);
var nodes;

if (match && match.length) {
var tagMatch = match[0];

// directive matched
if (/<![^-]/.test(tagMatch)) {
var directive = (
// remove angle brackets
tagMatch
.substring(1, tagMatch.length - 1)
.trim()
);

// tag name can no longer be first match item
tagMatch = match[1];

// remove directive from html
html = html.substring(html.indexOf('>') + 1);
}

// first tag name matched
if (tagMatch) {
var tagName = (
// keep only tag name
tagMatch
.substring(1, tagMatch.indexOf(' '))
.trim()
.toLowerCase()
)
}
// directive found
var match = html.match(DIRECTIVE_REGEX);
var directive;
if (match && match[1]) {
directive = match[1];
}

// create html document to parse top-level nodes
if (['html', 'head', 'body'].indexOf(tagName) > -1) {
var doc;

// `new DOMParser().parseFromString()`
// https://developer.mozilla.org/en-US/docs/Web/API/DOMParser#Parsing_an_SVG_or_HTML_document
if (window.DOMParser) {
doc = new window.DOMParser().parseFromString(html, 'text/html');

// `DOMImplementation.createHTMLDocument()`
// https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument
} else if (document.implementation.createHTMLDocument) {
doc = document.implementation.createHTMLDocument();
doc.documentElement.innerHTML = html;
doc.removeChild(doc.childNodes[0]); // remove doctype
}

// html
if (tagName === 'html') {
nodes = doc.childNodes;
// head and body
} else {
nodes = (
// do this so attributes are kept
// but there may be an extra head/body node
doc.getElementsByTagName(tagName)[0]
.parentNode
.childNodes
);
}

// `innerHTML` approach
} else {
var container = document.createElement('body');
container.innerHTML = html;
nodes = container.childNodes;
}

return formatDOM(nodes, null, directive);
}

/**
* Export HTML to DOM parser (client).
*/
module.exports = parseDOM;
return formatDOM(domparser(html), null, directive);
};
13 changes: 4 additions & 9 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,9 @@ describe('html-dom-parser', function() {

// client
describe('client parser', function() {
var parser = require('../lib/html-to-dom-client');
var jsdomify = require('jsdomify').default;

before(function() {
jsdomify.create();
});

after(function() {
jsdomify.destroy();
});
jsdomify.create();
var parser = require('../lib/html-to-dom-client');

// check if invalid parameter type throws error
throwTests(parser);
Expand All @@ -72,6 +65,8 @@ describe('html-dom-parser', function() {
runTests(parser, fixtures.html);
// svg does not work in jsdom
// runTests(parser, fixtures.svg);

jsdomify.destroy();
});

});

0 comments on commit 23fcfcc

Please sign in to comment.