Complete website build including: - Build Your Kit store page with cart system, sectioned layout (Hardware, Software, Attachments, Spare Parts), inline quote request form, and sticky sidebar summary - 16+ pages: Education, Platform, Resources, News, About Us, Download, Contact, Rover, Code Editor, Robot Simulator, etc. - 89+ MDX resource articles and 18 news posts - Store product images scraped from micromelon.com.au - Quote request API route with Airtable integration - Dynamic back links and cover photos on resource pages - Redesigned downloads page - Fixed corrupted MDX code blocks
281 lines
9.8 KiB
JavaScript
281 lines
9.8 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Scrape all resources from the Squarespace site using the JSON API.
 *
 * Usage: node scripts/scrape-resources.mjs
 */
|
|
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
|
|
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Output locations, resolved relative to the directory containing this script.
const CONTENT_DIR = path.join(__dirname, '..', 'content', 'resources');
const IMAGE_DIR = path.join(__dirname, '..', 'public', 'images', 'resources');

// Create both output directories up front; `recursive` makes this a no-op
// when they already exist.
fs.mkdirSync(CONTENT_DIR, { recursive: true });
fs.mkdirSync(IMAGE_DIR, { recursive: true });
|
|
|
|
/**
 * Extract resource post URLs from a sitemap XML document.
 *
 * Only `<loc>` entries under `/resources/` on micromelon.com.au (with or
 * without the `www.` prefix) are returned. Category and tag listing pages
 * are excluded.
 *
 * @param {string} xml - Raw sitemap XML.
 * @returns {string[]} Matching URLs in document order.
 */
function parseResourceUrls(xml) {
  const text = typeof xml === 'string' ? xml : '';

  // The pattern requires at least one character after `/resources/`, so the
  // bare resources index page can never match and needs no separate check.
  const locPattern =
    /<loc>\s*(https?:\/\/(?:www\.)?micromelon\.com\.au\/resources\/[^<\s]+)\s*<\/loc>/g;

  const urls = [];
  for (const [, url] of text.matchAll(locPattern)) {
    if (url.includes('/category/') || url.includes('/tag/')) continue;
    urls.push(url);
  }
  return urls;
}
|
|
|
|
/**
 * Fetch a page's JSON representation by requesting it with
 * `format=json-pretty`.
 *
 * @param {string} url - Absolute page URL.
 * @returns {Promise<object|null>} The parsed JSON, or `null` when the request
 *   fails, the response is not OK, the body is empty, or it is not valid JSON.
 */
async function fetchPageJson(url) {
  let text;
  try {
    // Build the query through URL so an existing query string is preserved
    // rather than producing a second `?`.
    const requestUrl = new URL(url);
    requestUrl.searchParams.set('format', 'json-pretty');

    const res = await fetch(requestUrl, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' },
    });
    if (!res.ok) return null;
    text = await res.text();
  } catch (err) {
    // A failed request for one page should be reported as "no data" like any
    // other failure here, not abort the caller.
    console.warn(`Request failed for ${url}: ${err?.message ?? err}`);
    return null;
  }

  if (!text.trim()) return null;

  try {
    return JSON.parse(text);
  } catch {
    // Non-JSON body: treated as an expected absence.
    return null;
  }
}
|
|
|
|
/** Turn a protocol-relative URL (`//host/path`) into an absolute https URL. */
function toAbsoluteUrl(src) {
  return src.startsWith('//') ? `https:${src}` : src;
}

/**
 * Decode the small set of HTML entities handled by this script.
 * `&amp;` is decoded last so already-escaped text such as `&amp;lt;` is
 * decoded exactly once (to `&lt;`) rather than twice (to `<`).
 */
function decodeHtmlEntities(text) {
  return text
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&#x27;/gi, "'")
    .replace(/&#x2F;/gi, '/')
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&');
}

/**
 * Convert a Squarespace post body (HTML) into Markdown/MDX text using
 * regex-based rewriting.
 *
 * Images become `![](url)`, video wrappers and iframes become JSX-style
 * `<iframe>` embeds, common inline/block tags become their Markdown
 * equivalents, and every other tag is removed.
 *
 * @param {string} html - Post body HTML; falsy input yields ''.
 * @returns {string} Trimmed Markdown with at most one blank line in a row.
 */
function sqsBodyToMarkdown(html) {
  if (!html) return '';

  // Embeds are emitted as real <iframe> markup, which the "remove all
  // remaining tags" pass below would delete. Park each one behind a
  // placeholder token and restore it once tag stripping is done.
  const embeds = [];
  const stashEmbed = (src, extraAttrs = '') => {
    embeds.push(
      `<iframe width="560" height="315" src="${src}" frameBorder="0"${extraAttrs} allowFullScreen></iframe>`
    );
    return `\n\n@@SQS_EMBED_${embeds.length - 1}@@\n\n`;
  };
  const toImage = (src) => `\n\n![](${toAbsoluteUrl(src)})\n\n`;

  let md = html;

  // Remove style and script blocks, including their contents.
  md = md.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  md = md.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');

  // Images: noscript fallbacks first, then lazy-loaded (data-src) tags,
  // then whatever plain <img> tags remain.
  md = md.replace(
    /<noscript><img[^>]*src="([^"]+)"[^>]*\/?><\/noscript>/gi,
    (m, src) => toImage(src)
  );
  md = md.replace(/<img[^>]*data-src="([^"]+)"[^>]*>/gi, (m, src) => toImage(src));
  md = md.replace(/<img[^>]*src="([^"]+)"[^>]*>/gi, (m, src) => {
    // Skip tracking pixels and tiny images
    if (src.includes('static1.squarespace.com/static') && !src.includes('content/')) return '';
    return toImage(src);
  });

  // Video wrappers carry the embed markup entity-encoded in `data-html`.
  md = md.replace(
    /<div[^>]*class="[^"]*sqs-video-wrapper[^"]*"[^>]*data-html="([^"]*)"[^>]*>[\s\S]*?<\/div>/gi,
    (m, encoded) => {
      const srcMatch = decodeHtmlEntities(encoded).match(/src=["']([^"']+)["']/);
      if (!srcMatch) return '';
      return stashEmbed(
        srcMatch[1],
        ' allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"'
      );
    }
  );

  // Direct iframe embeds.
  md = md.replace(/<iframe[^>]*src="([^"]+)"[^>]*>[\s\S]*?<\/iframe>/gi, (m, src) => stashEmbed(src));

  // Headers (h1-h4).
  for (let level = 1; level <= 4; level += 1) {
    const heading = new RegExp(`<h${level}\\b[^>]*>([\\s\\S]*?)<\\/h${level}>`, 'gi');
    md = md.replace(heading, `\n\n${'#'.repeat(level)} $1\n\n`);
  }

  // Links
  md = md.replace(/<a\b[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');

  // Bold/italic. The `\b` keeps `<b` from matching <br>/<blockquote> and
  // `<em` from matching <embed>.
  md = md.replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**');
  md = md.replace(/<b\b[^>]*>([\s\S]*?)<\/b>/gi, '**$1**');
  md = md.replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, '*$1*');

  // Code. Fenced blocks go first and drop any inner <code> wrapper so the
  // inline rule does not leave stray backticks inside the fence.
  md = md.replace(
    /<pre\b[^>]*>([\s\S]*?)<\/pre>/gi,
    (m, inner) => `\n\n\`\`\`\n${inner.replace(/<\/?code\b[^>]*>/gi, '')}\n\`\`\`\n\n`
  );
  md = md.replace(/<code\b[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');

  // Lists. Paragraph wrappers inside an item are dropped so the text stays
  // on the same line as its "- " marker.
  md = md.replace(/<\/li>\s*<li\b[^>]*>/gi, '</li>\n<li>');
  md = md.replace(
    /<li\b[^>]*>([\s\S]*?)<\/li>/gi,
    (m, inner) => `- ${inner.replace(/<\/?p\b[^>]*>/gi, '').trim()}`
  );
  md = md.replace(/<\/?[uo]l[^>]*>/gi, '\n');

  // Paragraphs
  md = md.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '\n\n$1\n\n');
  md = md.replace(/<br\s*\/?>/gi, '\n');
  md = md.replace(/<hr\s*\/?>/gi, '\n\n---\n\n');

  // Blockquotes: prefix every line so multi-paragraph quotes stay quoted.
  md = md.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, (m, inner) => {
    const quoted = inner
      .trim()
      .replace(/\n{2,}/g, '\n\n')
      .split('\n')
      .map((line) => (line.trim() ? `> ${line.trim()}` : '>'))
      .join('\n');
    return `\n\n${quoted}\n\n`;
  });

  // Remove all remaining HTML tags
  md = md.replace(/<[^>]+>/g, '');

  // Decode entities
  md = decodeHtmlEntities(md);

  // Remove "BACK TO POSTS" link that Squarespace adds
  md = md.replace(/\[?\s*\*?\*?BACK TO POSTS\*?\*?\s*\]?\s*\([^)]*\)/gi, '');

  // Put the stashed embeds back now that tag stripping is finished.
  md = md.replace(/@@SQS_EMBED_(\d+)@@/g, (m, index) => embeds[Number(index)]);

  // Clean up excessive whitespace
  md = md.replace(/[ \t]+$/gm, ''); // trailing spaces
  md = md.replace(/^\s+$/gm, ''); // whitespace-only lines -> empty
  md = md.replace(/\n{3,}/g, '\n\n'); // max 2 newlines

  return md.trim();
}
|
|
|
|
/**
 * Produce a plain-text excerpt of at most 250 characters for a post.
 *
 * Uses `item.excerpt` when it contains text after tags are stripped;
 * otherwise falls back to `item.body`.
 *
 * @param {{ excerpt?: string, body?: string }} item - Post data.
 * @returns {string} Tag-free, whitespace-collapsed excerpt (may be '').
 */
function getCleanExcerpt(item) {
  const toPlainText = (html) =>
    String(html ?? '')
      .replace(/<[^>]+>/g, '')
      .replace(/\s+/g, ' ')
      .trim();

  // An excerpt made only of markup or whitespace strips down to '', in which
  // case the body is a better source than an empty string.
  const text = toPlainText(item?.excerpt) || toPlainText(item?.body);
  return text.substring(0, 250);
}
|
|
|
|
/**
 * Download an image into IMAGE_DIR unless a file with that name already
 * exists there.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} filename - Target file name, relative to IMAGE_DIR.
 * @returns {Promise<string|null>} The site-relative path
 *   (`/images/resources/<filename>`), or `null` when the response is not OK,
 *   the payload is under 100 bytes, or any error occurs.
 */
async function downloadImage(url, filename) {
  const publicPath = `/images/resources/${filename}`;
  try {
    const filepath = path.join(IMAGE_DIR, filename);
    if (fs.existsSync(filepath)) return publicPath;

    const res = await fetch(url, { headers: { 'User-Agent': 'Mozilla/5.0' } });
    if (!res.ok) return null;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < 100) return null; // skip tiny/empty files

    // `filename` may contain path separators; make sure the parent directory
    // exists so the write does not fail.
    fs.mkdirSync(path.dirname(filepath), { recursive: true });
    fs.writeFileSync(filepath, buffer);
    return publicPath;
  } catch (err) {
    // Best-effort: a missing image must not abort the caller, but report it.
    console.warn(`Image download failed for ${url}: ${err?.message ?? err}`);
    return null;
  }
}
|
|
|
|
/**
 * Scrape every resource post listed in the sitemap and write one `.mdx`
 * file per post into CONTENT_DIR, downloading featured images on the way.
 *
 * Existing top-level `.mdx` files in CONTENT_DIR are deleted first, but only
 * after the sitemap has been fetched successfully and yielded at least one
 * URL, so a failed run does not wipe previously scraped content.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the sitemap request fails or contains no resource URLs.
 */
async function main() {
  // Values are written inside double-quoted front matter strings, so double
  // quotes are swapped for single quotes and backslashes are escaped.
  const yamlQuote = (value) => String(value).replace(/\\/g, '\\\\').replace(/"/g, "'");

  // Returns YYYY-MM-DD, or null for a missing or unparseable value.
  const toIsoDate = (value) => {
    if (!value) return null;
    const parsed = new Date(value);
    return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString().split('T')[0];
  };

  console.log('Fetching sitemap...');
  const sitemapRes = await fetch('https://www.micromelon.com.au/sitemap.xml', {
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  if (!sitemapRes.ok) {
    throw new Error(`Sitemap request failed with HTTP ${sitemapRes.status}`);
  }
  const sitemapXml = await sitemapRes.text();
  const urls = parseResourceUrls(sitemapXml);
  console.log(`Found ${urls.length} resource posts.\n`);
  if (urls.length === 0) {
    throw new Error('No resource URLs found in sitemap; existing content left untouched.');
  }

  // Clean out old files
  for (const f of fs.readdirSync(CONTENT_DIR)) {
    if (f.endsWith('.mdx')) fs.unlinkSync(path.join(CONTENT_DIR, f));
  }

  let count = 0;
  let skipped = 0;

  for (const url of urls) {
    // Drop any query/fragment and trailing slash so the slug is usable as a
    // file name.
    const slug = (url.split('/resources/')[1] ?? '')
      .replace(/[?#].*$/, '')
      .replace(/\/+$/, '');
    if (!slug) continue;

    count++;
    process.stdout.write(`[${count}/${urls.length}] ${slug}...`);

    const data = await fetchPageJson(url);
    if (!data || !data.item) {
      console.log(' SKIP (no JSON)');
      skipped++;
      continue;
    }

    const item = data.item;
    const title = yamlQuote(item.title || slug);
    const date =
      toIsoDate(item.publishOn) ?? toIsoDate(item.addedOn) ?? toIsoDate(Date.now());
    const categories = item.categories || [];
    const tags = item.tags || [];
    const excerpt = yamlQuote(getCleanExcerpt(item));

    // Featured image
    let featuredImage = '';
    if (item.assetUrl) {
      const imgUrl = item.assetUrl.startsWith('//') ? `https:${item.assetUrl}` : item.assetUrl;
      const ext = path.extname(imgUrl.split('?')[0]) || '.jpg';
      const localPath = await downloadImage(imgUrl, `${slug}${ext}`);
      if (localPath) featuredImage = localPath;
    }

    // Convert body
    const content = sqsBodyToMarkdown(item.body || '');

    // Build frontmatter
    const fm = [
      '---',
      `title: "${title}"`,
      `date: "${date}"`,
      `categories: [${categories.map((c) => `"${yamlQuote(c)}"`).join(', ')}]`,
      `tags: [${tags.map((t) => `"${yamlQuote(t)}"`).join(', ')}]`,
      `excerpt: "${excerpt}"`,
    ];
    if (featuredImage) fm.push(`featuredImage: "${featuredImage}"`);
    fm.push('---');

    const outPath = path.join(CONTENT_DIR, `${slug}.mdx`);
    // A slug may contain `/`; create its parent directory before writing.
    fs.mkdirSync(path.dirname(outPath), { recursive: true });
    fs.writeFileSync(outPath, `${fm.join('\n')}\n\n${content}\n`);

    console.log(` OK (${content.length} chars)`);
    // Pause between posts to avoid hammering the remote server.
    await new Promise((r) => setTimeout(r, 300));
  }

  const files = fs.readdirSync(CONTENT_DIR).filter((f) => f.endsWith('.mdx'));
  const images = fs.readdirSync(IMAGE_DIR);
  console.log(`\nDone! ${files.length} MDX files, ${images.length} images. Skipped: ${skipped}`);
}
|
|
|
|
// Run the scraper. On failure, log the error and mark the process as failed
// so shells and CI see a non-zero exit status.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|