#!/usr/bin/env node
/**
 * Scrape all resources from the Squarespace site using the JSON API.
 *
 * Usage: node scripts/scrape-resources.mjs
 *
 * Flow: read the sitemap, fetch every resource post as JSON
 * (`?format=json-pretty`), convert its HTML body to Markdown, download the
 * featured image, and write one `.mdx` file (frontmatter + content) per post.
 */

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const CONTENT_DIR = path.join(__dirname, '..', 'content', 'resources');
const IMAGE_DIR = path.join(__dirname, '..', 'public', 'images', 'resources');

// Pause between posts so the remote site is not hammered.
const REQUEST_DELAY_MS = 300;

// Named entities decoded by decodeHtmlEntities(); numeric ones are handled generically.
const NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

fs.mkdirSync(CONTENT_DIR, { recursive: true });
fs.mkdirSync(IMAGE_DIR, { recursive: true });

/**
 * Decode HTML entities in a single pass.
 *
 * A single pass matters: decoding `&amp;` first and the other entities
 * afterwards would turn `&amp;lt;` into `<` instead of the intended `&lt;`.
 *
 * @param {string} text - Text that may contain named or numeric entities.
 * @returns {string} Text with known entities replaced; unknown ones are kept.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (entity, body) => {
    if (!body.startsWith('#')) {
      return NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
    }
    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    if (!(codePoint > 0 && codePoint <= 0x10ffff)) return entity;
    // A numeric non-breaking space becomes a plain space, like `&nbsp;`.
    return codePoint === 0xa0 ? ' ' : String.fromCodePoint(codePoint);
  });
}

/**
 * Turn a protocol-relative URL (`//host/path`) into an https URL.
 *
 * @param {string} src - URL as found in the markup.
 * @returns {string} The URL, prefixed with `https:` when it started with `//`.
 */
function toAbsoluteUrl(src) {
  return src.startsWith('//') ? `https:${src}` : src;
}

/**
 * Quote a value for use as a double-quoted YAML scalar in the frontmatter.
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} The value wrapped in double quotes, with `"` replaced by
 *   `'` and backslashes escaped.
 */
function yamlString(value) {
  const safe = String(value).replace(/\\/g, '\\\\').replace(/"/g, "'");
  return `"${safe}"`;
}

/**
 * Extract resource post URLs from the sitemap XML.
 *
 * Category and tag listing pages are skipped. Both the bare and the `www.`
 * host are accepted, and duplicates are dropped.
 *
 * @param {string} xml - Raw sitemap XML.
 * @returns {string[]} Unique resource post URLs in sitemap order.
 */
function parseResourceUrls(xml) {
  if (!xml) return [];
  const locPattern =
    /<loc>\s*(https?:\/\/(?:www\.)?micromelon\.com\.au\/resources\/[^<\s]+)\s*<\/loc>/g;
  const urls = new Set();
  for (const [, url] of xml.matchAll(locPattern)) {
    if (url.includes('/category/') || url.includes('/tag/')) continue;
    urls.add(url);
  }
  return [...urls];
}

/**
 * Fetch the JSON representation of a page.
 *
 * Best effort by design: any failure (network error, non-2xx status, empty
 * body, invalid JSON) yields `null`, which the caller reports as a skip.
 *
 * @param {string} url - Page URL.
 * @returns {Promise<object|null>} Parsed JSON, or `null` when unavailable.
 */
async function fetchPageJson(url) {
  try {
    const jsonUrl = new URL(url);
    jsonUrl.searchParams.set('format', 'json-pretty');
    const res = await fetch(jsonUrl, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' },
    });
    if (!res.ok) return null;
    const text = await res.text();
    if (!text.trim()) return null;
    return JSON.parse(text);
  } catch {
    return null;
  }
}

/**
 * Convert a Squarespace post body (HTML) to Markdown.
 *
 * The conversion is regex based and runs in a fixed order: blocks that must be
 * removed or replaced wholesale (style, script, images, embeds, pre) first,
 * then inline and block formatting, then every remaining tag is stripped and
 * entities are decoded.
 *
 * @param {string} html - HTML body of the post.
 * @returns {string} Markdown text; empty string for empty input.
 */
function sqsBodyToMarkdown(html) {
  if (!html) return '';

  let md = html;

  // Drop style and script blocks entirely.
  md = md.replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '');
  md = md.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '');

  const toImage = (_match, src) => `\n\n![](${toAbsoluteUrl(src)})\n\n`;

  // Image blocks: the <noscript> copy carries the real src.
  md = md.replace(
    /<noscript>\s*<img\b[^>]*\ssrc="([^"]+)"[^>]*\/?>\s*<\/noscript>/gi,
    toImage,
  );

  // Lazy-loaded images keep their URL in data-src.
  md = md.replace(/<img\b[^>]*\sdata-src="([^"]+)"[^>]*>/gi, toImage);

  // Regular images.
  md = md.replace(/<img\b[^>]*\ssrc="([^"]+)"[^>]*>/gi, (match, src) => {
    // Skip tracking pixels and tiny images
    if (src.includes('static1.squarespace.com/static') && !src.includes('content/')) return '';
    return toImage(match, src);
  });

  // Video blocks: the embed markup is entity-encoded inside data-html.
  // Emitted as a Markdown link so it survives the tag stripping below.
  md = md.replace(
    /<div\b[^>]*class="[^"]*sqs-video-wrapper[^"]*"[^>]*data-html="([^"]*)"[^>]*>[\s\S]*?<\/div>/gi,
    (_match, encoded) => {
      const src = decodeHtmlEntities(encoded).match(/src=["']([^"']+)["']/)?.[1];
      return src ? `\n\n[Embedded video](${toAbsoluteUrl(src)})\n\n` : '';
    },
  );

  // Direct iframe embeds, kept as links for the same reason.
  md = md.replace(
    /<iframe\b[^>]*\ssrc=["']([^"']+)["'][^>]*>[\s\S]*?<\/iframe>/gi,
    (_match, src) => `\n\n[Embedded content](${toAbsoluteUrl(src)})\n\n`,
  );

  // Preformatted blocks go before inline code so <pre><code> yields a plain fence.
  md = md.replace(
    /<pre\b[^>]*>(?:<code\b[^>]*>)?([\s\S]*?)(?:<\/code>)?<\/pre>/gi,
    (_match, code) => `\n\n\`\`\`\n${code}\n\`\`\`\n\n`,
  );

  // Headers
  md = md.replace(
    /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi,
    (_match, level, text) => `\n\n${'#'.repeat(Number(level))} ${text}\n\n`,
  );

  // Links
  md = md.replace(
    /<a\b[^>]*\shref="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi,
    (_match, href, text) => `[${text.trim()}](${href})`,
  );

  // Bold/italic. Whitespace is moved outside the markers because
  // `**text **` is not recognised as emphasis by Markdown parsers.
  const wrapInline = (marker) => (_match, inner) => {
    const text = inner.trim();
    if (!text) return inner;
    const lead = inner.match(/^\s*/)[0];
    const trail = inner.match(/\s*$/)[0];
    return `${lead}${marker}${text}${marker}${trail}`;
  };
  md = md.replace(/<strong\b[^>]*>([\s\S]*?)<\/strong>/gi, wrapInline('**'));
  md = md.replace(/<b\b[^>]*>([\s\S]*?)<\/b>/gi, wrapInline('**'));
  md = md.replace(/<em\b[^>]*>([\s\S]*?)<\/em>/gi, wrapInline('*'));

  // Inline code
  md = md.replace(/<code\b[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');

  // Lists: one bullet per item; inner <p> wrappers are flattened so the
  // bullet and its text stay on the same line.
  md = md.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_match, inner) => {
    const text = inner.replace(/<\/?p\b[^>]*>/gi, ' ').replace(/\s+/g, ' ').trim();
    return `\n- ${text}`;
  });
  md = md.replace(/<\/?[uo]l\b[^>]*>/gi, '\n');

  // Paragraphs, line breaks and rules
  md = md.replace(/<p\b[^>]*>([\s\S]*?)<\/p>/gi, '\n\n$1\n\n');
  md = md.replace(/<br\b[^>]*>/gi, '\n');
  md = md.replace(/<hr\b[^>]*>/gi, '\n\n---\n\n');

  // Blockquotes: every line gets a marker. Leftover tags are stripped first
  // so a marker can never land inside a tag that spans several lines.
  md = md.replace(/<blockquote\b[^>]*>([\s\S]*?)<\/blockquote>/gi, (_match, inner) => {
    const text = inner.replace(/<[^>]+>/g, '').trim().replace(/\n{2,}/g, '\n\n');
    if (!text) return '';
    const quoted = text
      .split('\n')
      .map((line) => (line.trim() ? `> ${line}` : '>'))
      .join('\n');
    return `\n\n${quoted}\n\n`;
  });

  // Remove all remaining HTML tags
  md = md.replace(/<[^>]+>/g, '');

  // Decode entities
  md = decodeHtmlEntities(md);

  // Remove "BACK TO POSTS" link that Squarespace adds
  md = md.replace(/\[?\s*\*?\*?BACK TO POSTS\*?\*?\s*\]?\s*\([^)]*\)/gi, '');

  // The <noscript> copy and the lazy-loaded copy of one image both convert to
  // the same Markdown image; keep only one of each consecutive run.
  md = md.replace(/(!\[\]\(([^)\s]+)\))(?:\s*!\[\]\(\2\))+/g, '$1');

  // Clean up excessive whitespace
  md = md.replace(/[ \t]+$/gm, ''); // trailing spaces
  md = md.replace(/^\s+$/gm, ''); // whitespace-only lines -> empty
  md = md.replace(/\n{3,}/g, '\n\n'); // max 2 newlines

  return md.trim();
}

/**
 * Build a plain-text excerpt of at most 250 characters.
 *
 * Uses the Squarespace excerpt when it contains text, otherwise falls back to
 * the start of the body. Tags are removed, entities decoded and whitespace
 * collapsed.
 *
 * @param {{excerpt?: string, body?: string}} item - Post item from the JSON API.
 * @returns {string} Excerpt text, possibly empty.
 */
function getCleanExcerpt(item) {
  const toPlainText = (htmlText) =>
    decodeHtmlEntities(String(htmlText ?? '').replace(/<[^>]+>/g, ''))
      .replace(/\s+/g, ' ')
      .trim();

  const text = toPlainText(item.excerpt) || toPlainText(item.body);
  return text.substring(0, 250);
}

/**
 * Download an image into IMAGE_DIR unless a file with that name already exists.
 *
 * Best effort by design: a failed download must not abort the scrape, so any
 * error yields `null` and the post is written without a featured image.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} filename - Target file name inside IMAGE_DIR.
 * @returns {Promise<string|null>} Site-relative image path, or `null` on failure.
 */
async function downloadImage(url, filename) {
  const publicPath = `/images/resources/${filename}`;
  try {
    const filepath = path.join(IMAGE_DIR, filename);
    if (fs.existsSync(filepath)) return publicPath;

    const res = await fetch(url, { headers: { 'User-Agent': 'Mozilla/5.0' } });
    if (!res.ok) return null;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < 100) return null; // skip tiny/empty files

    fs.writeFileSync(filepath, buffer);
    return publicPath;
  } catch {
    return null;
  }
}

/**
 * Scrape every resource post listed in the sitemap and regenerate the MDX files.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the sitemap cannot be fetched or lists no resource
 *   posts. Both are checked before any existing file is deleted.
 */
async function main() {
  console.log('Fetching sitemap...');
  const sitemapRes = await fetch('https://www.micromelon.com.au/sitemap.xml', {
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  if (!sitemapRes.ok) {
    throw new Error(`Sitemap request failed with HTTP ${sitemapRes.status}`);
  }
  const sitemapXml = await sitemapRes.text();
  const urls = parseResourceUrls(sitemapXml);
  console.log(`Found ${urls.length} resource posts.\n`);

  // Refuse to wipe the existing content when there is nothing to replace it with.
  if (urls.length === 0) {
    throw new Error('No resource URLs found in the sitemap; existing files left untouched.');
  }

  // Clean out old files
  for (const f of fs.readdirSync(CONTENT_DIR)) {
    if (f.endsWith('.mdx')) fs.unlinkSync(path.join(CONTENT_DIR, f));
  }

  let count = 0;
  let skipped = 0;

  for (const url of urls) {
    const slug = url.split('/resources/')[1];
    if (!slug) continue;

    count++;
    process.stdout.write(`[${count}/${urls.length}] ${slug}...`);

    const data = await fetchPageJson(url);
    if (!data?.item) {
      console.log(' SKIP (no JSON)');
      skipped++;
      continue;
    }

    const { item } = data;
    const date = new Date(item.publishOn || item.addedOn || Date.now())
      .toISOString()
      .split('T')[0];
    const categories = item.categories || [];
    const tags = item.tags || [];

    // Featured image
    let featuredImage = '';
    if (item.assetUrl) {
      const imgUrl = toAbsoluteUrl(item.assetUrl);
      const ext = path.extname(imgUrl.split('?')[0]) || '.jpg';
      const localPath = await downloadImage(imgUrl, `${slug}${ext}`);
      if (localPath) featuredImage = localPath;
    }

    // Convert body
    const content = sqsBodyToMarkdown(item.body || '');

    // Build frontmatter
    const fm = [
      '---',
      `title: ${yamlString(item.title || slug)}`,
      `date: "${date}"`,
      `categories: [${categories.map(yamlString).join(', ')}]`,
      `tags: [${tags.map(yamlString).join(', ')}]`,
      `excerpt: ${yamlString(getCleanExcerpt(item))}`,
    ];
    if (featuredImage) fm.push(`featuredImage: "${featuredImage}"`);
    fm.push('---');

    fs.writeFileSync(
      path.join(CONTENT_DIR, `${slug}.mdx`),
      `${fm.join('\n')}\n\n${content}\n`,
    );

    console.log(` OK (${content.length} chars)`);
    await new Promise((resolve) => setTimeout(resolve, REQUEST_DELAY_MS));
  }

  const files = fs.readdirSync(CONTENT_DIR).filter((f) => f.endsWith('.mdx'));
  const images = fs.readdirSync(IMAGE_DIR);
  console.log(`\nDone! ${files.length} MDX files, ${images.length} images. Skipped: ${skipped}`);
}

main().catch((err) => {
  console.error(err);
  // Report failure to the calling shell instead of exiting with status 0.
  process.exitCode = 1;
});