#!/usr/bin/env node /** * Scrape all resources from the Squarespace site using the JSON API. * Usage: node scripts/scrape-resources.mjs */ import fs from 'fs'; import path from 'path'; import { fileURLToPath } from 'url'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const CONTENT_DIR = path.join(__dirname, '..', 'content', 'resources'); const IMAGE_DIR = path.join(__dirname, '..', 'public', 'images', 'resources'); fs.mkdirSync(CONTENT_DIR, { recursive: true }); fs.mkdirSync(IMAGE_DIR, { recursive: true }); function parseResourceUrls(xml) { const urls = []; const regex = /(https?:\/\/micromelon\.com\.au\/resources\/[^<]+)<\/loc>/g; let match; while ((match = regex.exec(xml)) !== null) { const url = match[1]; if (url.includes('/category/') || url.includes('/tag/')) continue; if (url === 'https://micromelon.com.au/resources') continue; urls.push(url); } return urls; } async function fetchPageJson(url) { const res = await fetch(`${url}?format=json-pretty`, { headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' }, }); if (!res.ok) return null; const text = await res.text(); if (!text.trim()) return null; try { return JSON.parse(text); } catch { return null; } } function sqsBodyToMarkdown(html) { if (!html) return ''; // Step 1: Extract meaningful content from Squarespace block structure // The body is wrapped in sqs-layout divs. We need to find the actual content blocks. let md = html; // Remove Squarespace layout/grid wrapper divs but keep sqs-block-content // First, extract all image URLs from data-src attributes (Squarespace lazy loading) const imageSrcs = []; const imgDataSrcRegex = /data-src="([^"]+)"/g; let imgMatch; while ((imgMatch = imgDataSrcRegex.exec(html)) !== null) { const src = imgMatch[1].startsWith('//') ? `https:${imgMatch[1]}` : imgMatch[1]; imageSrcs.push(src); } // Extract video embeds (YouTube, Vimeo, etc.) const videos = []; const videoRegex = /data-html="([^"]*)"/g; let videoMatch; while ((videoMatch = videoRegex.exec(html)) !== null) { const decoded = videoMatch[1] .replace(/</g, '<').replace(/>/g, '>') .replace(/&/g, '&').replace(/"/g, '"'); const srcMatch = decoded.match(/src=["']([^"']+)["']/); if (srcMatch) videos.push(srcMatch[1]); } // Also find direct iframe embeds const iframeRegex = /]*src=["']([^"']+)["'][^>]*>/gi; let iframeMatch; while ((iframeMatch = iframeRegex.exec(html)) !== null) { if (!videos.includes(iframeMatch[1])) videos.push(iframeMatch[1]); } // Now do the HTML-to-markdown conversion // Remove all Squarespace-specific wrapper divs, keeping inner content // Remove style blocks md = md.replace(/]*>[\s\S]*?<\/style>/gi, ''); // Remove script blocks md = md.replace(/]*>[\s\S]*?<\/script>/gi, ''); // Handle Squarespace image blocks - replace with clean markdown images // Match the noscript img tags which have the actual src md = md.replace(/