Skip to content

Commit

Permalink
Blog import (#47)
Browse files Browse the repository at this point in the history
* First version of blog article import with basic cleanup

* Added tags to meta table, removed inline metadata

* Blog import

* Fixing tags

* Fixing metadata

* Making image more stable

* Trying to have image be correct in the index

* Adding the CA urls

* Using Image from meta if it exists

* Removed page edited manually

---------

Co-authored-by: Chris Bohnert <[email protected]>
  • Loading branch information
fe-lix- and bohnertchris authored Nov 30, 2023
1 parent d61355f commit 9388c20
Show file tree
Hide file tree
Showing 6 changed files with 511 additions and 8 deletions.
461 changes: 461 additions & 0 deletions tools/blog-us.sorted.txt

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions tools/importer/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,12 @@ export default {
'div.cmp-page__skiptomaincontent',
'div#mainContent',
'div.page-header',
// Remove navigation from the beginning of blog entries as well as
// readmore-type teasers and blurb about 24petwatch at the end
// Remove navigation from the beginning of blog entries as well as readmore-type
// teasers and blurb about 24petwatch at the end
'nav',
'div.imagelist',
'div.cmp-experiencefragment--blog-page-cta-component',
'div.cmp-layout-manual-articles',
]);

// create the metadata block and append it to the main element
Expand Down
7 changes: 7 additions & 0 deletions tools/importer/transformers/blogArticle.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ function createBlogArticle(main, document) {
const dds = document.querySelectorAll('dd');
if (dts) {
for (let i = 0; i < dts.length; i += 1) {
if (dts[i].textContent === 'Byline') {
const span = document.createElement('em');
const authorText = dds[i].textContent.trim();
span.textContent = authorText;
dts[i].closest('article').appendChild(span);
}

if (dts[i].textContent === 'Text') {
const div = document.createElement('div');
div.innerHTML = dds[i].innerHTML;
Expand Down
4 changes: 2 additions & 2 deletions tools/importer/transformers/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import createFooter from './footer.js';
import createFullLayoutSection from './fullLayoutSection.js';
import createHomepage from './homepage.js';
import createHeader from './header.js';
// import createHero from './hero.js';
import createHero from './hero.js';
import createMetadata from './metadata.js';
import createBold from './bold.js';
import blogBanner from './blogBanner.js';
Expand Down Expand Up @@ -36,5 +36,5 @@ export const preTransformers = [

export const postTransformers = [
createMetadata,
cleanBlog,
cleanBlog
];
20 changes: 20 additions & 0 deletions tools/importer/transformers/makeProxySrcs.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
 * Rewrites the `src` of every `<img>` under `main` so that it points at the
 * local proxy running on `http://localhost:3001`.
 *
 * Each image URL is first made absolute (protocol-relative URLs get an
 * `https:` scheme, root-relative URLs get the origin of `host`). The absolute
 * URL's origin is then appended as a `host` query parameter, and the path and
 * query string are re-rooted on the proxy.
 *
 * Images whose `src` cannot be parsed as an absolute URL are left untouched
 * and a warning is logged.
 *
 * @param {Element} main - Root element whose descendant images are rewritten.
 * @param {Document} document - Document being imported (unused here).
 * @param {string} [host='https://www.24petwatch.com'] - Base URL used to
 *   resolve root-relative image paths.
 * @returns {void}
 */
function makeProxySrcs(main, document, host = 'https://www.24petwatch.com') {
  main.querySelectorAll('img').forEach((img) => {
    if (img.src.startsWith('//')) {
      // Protocol-relative URL: pin it to https.
      img.src = `https:${img.src}`;
    } else if (img.src.startsWith('/')) {
      // Root-relative URL: make it absolute against the configured host.
      const cu = new URL(host);
      img.src = `${cu.origin}${img.src}`;
    }
    try {
      const u = new URL(img.src);
      // Record the original origin in the `host` query parameter.
      u.searchParams.append('host', u.origin);
      img.src = `http://localhost:3001${u.pathname}${u.search}`;
    } catch (error) {
      // Best effort: keep the original src and report why it was skipped.
      console.warn(`Unable to make proxy src for ${img.src}: ${error.message}`);
    }
  });
}

export default makeProxySrcs;
22 changes: 18 additions & 4 deletions tools/importer/transformers/metadata.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,17 @@ const createMetadata = (main, document) => {
meta.Description = desc.content;
}

const img = document.querySelector('[property="og:image"]');
if (img && img.content) {
const img = document.querySelector('body img:first-child');
if (img && img.src) {
const el = document.createElement('img');
el.src = img.content;
el.src = img.src.replace('https://www.24petwatch.com', '');
meta.Image = el;
}

const metaImage = document.querySelector('[property="og:image"]');
if (metaImage) {
const el = document.createElement('img');
el.src = metaImage.content.replace('https://www.24petwatch.com', '');
meta.Image = el;
}

Expand All @@ -29,7 +36,9 @@ const createMetadata = (main, document) => {
const blogTags = document.querySelectorAll('div.cmp-contentfragment__element--tag > dd.cmp-contentfragment__element-value');
if (blogTags) {
for (let i = 0; i < blogTags.length; i += 1) {
meta.Tags = blogTags[i].innerHTML.replace('<br>', ' ');
meta.Tags = blogTags[i].innerHTML.trim()
.replaceAll('24petwatch:newletter/topic/', '')
.replaceAll('<br>', ',');
}
}

Expand All @@ -44,6 +53,11 @@ const createMetadata = (main, document) => {
}
}

const author = document.querySelector('div.cmp-contentfragment__element--byline dd');
if (author) {
meta.Author = author.textContent.trim().replace(/^By /, '');
}

const block = WebImporter.Blocks.getMetadataBlock(document, meta);
main.append(block);

Expand Down

0 comments on commit 9388c20

Please sign in to comment.