Build Your Kit page and full Micromelon website
Complete website build including:
- Build Your Kit store page with cart system, sectioned layout (Hardware, Software, Attachments, Spare Parts), inline quote request form, and sticky sidebar summary
- 16+ pages: Education, Platform, Resources, News, About Us, Download, Contact, Rover, Code Editor, Robot Simulator, etc.
- 89+ MDX resource articles and 18 news posts
- Store product images scraped from micromelon.com.au
- Quote request API route with Airtable integration
- Dynamic back links and cover photos on resource pages
- Redesigned downloads page
- Fixed corrupted MDX code blocks
This commit is contained in:
45
scripts/cleanup-mdx.mjs
Normal file
45
scripts/cleanup-mdx.mjs
Normal file
@@ -0,0 +1,45 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const dir = path.join(__dirname, '..', 'content', 'resources');

/**
 * Strip scraper artifacts from a frontmatter excerpt.
 *
 * Removes `#block-… { … }` CSS rules and markdown images, unwraps markdown
 * links to their label, drops `**` markers, collapses whitespace and caps the
 * result at 250 characters.
 *
 * @param {string} raw - Excerpt text captured between the double quotes.
 * @returns {string} The cleaned excerpt.
 */
function cleanExcerpt(raw) {
  return raw
    .replace(/#block-[^\s]+ \{[^}]*\}/g, '')
    .replace(/!\[[^\]]*\]\([^)]*\)/g, '')
    .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')
    .replace(/\*\*/g, '')
    .replace(/\s+/g, ' ')
    .trim()
    .substring(0, 250);
}

/**
 * Run every cleanup pass over the full text of one MDX file.
 *
 * @param {string} source - Complete file contents (frontmatter and body).
 * @returns {string} The cleaned file contents.
 */
function cleanMdxSource(source) {
  let content = source;

  // Clean excerpts that contain HTML/CSS artifacts
  const excerptMatch = content.match(/^excerpt: "(.*)"$/m);
  if (excerptMatch) {
    const excerpt = cleanExcerpt(excerptMatch[1]);
    if (excerpt !== excerptMatch[1]) {
      // A function replacer is used so that `$&`, `$$`, `$'` … occurring in the
      // excerpt text are inserted literally instead of being expanded.
      content = content.replace(excerptMatch[0], () => `excerpt: "${excerpt}"`);
    }
  }

  // Remove BACK TO POSTS lines (optionally followed by an OPEN PAGE AS PDF link)
  content = content.replace(/\[?\s*\*?\*?BACK TO POSTS\s*\*?\*?\s*\]?\s*\([^)]*\)\s*(\*?\*?\s*\|?\s*\*?\*?\s*\[?\*?\*?OPEN PAGE AS PDF\*?\*?\]?\([^)]*\))?/gi, '');

  // Remove lines that consist only of a "-->" artifact
  content = content.replace(/^-->\s*$/gm, '');

  // Remove CSS block definitions
  content = content.replace(/#block-[a-zA-Z0-9_-]+ \{[^}]*\}/g, '');

  // Collapse runs of three or more newlines into a single blank line
  content = content.replace(/\n{3,}/g, '\n\n');

  return content;
}

let fixed = 0;
for (const name of fs.readdirSync(dir).filter((entry) => entry.endsWith('.mdx'))) {
  const filepath = path.join(dir, name);
  const original = fs.readFileSync(filepath, 'utf8');
  const cleaned = cleanMdxSource(original);

  // Only touch files whose contents actually changed.
  if (cleaned !== original) {
    fs.writeFileSync(filepath, cleaned);
    fixed++;
  }
}
console.log(`Cleaned ${fixed} files`);
|
||||
56
scripts/download-assets.sh
Normal file
56
scripts/download-assets.sh
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/bin/bash
# Download assets from current Squarespace site.
#
# Run from the repository root: files are written below public/images/.
# Exits non-zero if any download fails.
set -uo pipefail

BASE="https://images.squarespace-cdn.com/content/v1/60a43bf842d7b601064a8828"
OUT="public/images"

# curl -o does not create missing directories, so make them up front.
mkdir -p "$OUT"/{founders,partners,awards,hero,products}

failures=0

# fetch <path below $BASE> <path below $OUT>
# -f makes curl fail on HTTP errors instead of saving the error page as an image.
fetch() {
  local src="$1"
  local dest="$2"
  if ! curl -fsSL "$BASE/$src" -o "$OUT/$dest"; then
    echo "Failed to download $src -> $OUT/$dest" >&2
    failures=$((failures + 1))
  fi
}

# Founders
fetch "207a7c93-62e4-4cce-960e-3aac9d0f326d/adam.png" "founders/adam.png"
fetch "1a6645b4-9e27-4b49-b8ff-c3644c358f4c/tim.png" "founders/tim.png"

# Partner logos
fetch "1633901579287-HDYLZ442KFVAZWZCPSXI/codingathome.png" "partners/codingathome.png"
fetch "1633901627921-UKKWATKWSZF65M8PILDY/worldsciencefair.png" "partners/worldsciencefair.png"
fetch "1633901657858-W25D5TI13OTSG76POUDS/cisco.png" "partners/cisco.png"
fetch "1633901706899-JBER9BFFNRKAW6ZF0KCH/uq-logo-lockup-purple.png" "partners/uq.png"
fetch "1633901726291-HK73RNB7LCNZ17IXQYMC/shapelabs.png" "partners/shapelabs.png"
fetch "1633901745735-MR5B6UE4A89BLW9H4PLM/roborave.png" "partners/roborave.png"
fetch "1634585888937-VZJBKXYBI5V28XGXN8T9/robo-gals-new.png" "partners/robogals.png"
fetch "961ac034-e08c-4585-aeee-90cff217a3cc/Screen-Shot-2022-01-19-at-2.41.17-pm.png" "partners/somerset.png"
fetch "4a1c8c44-deba-49fe-bed9-100bced46984/MARS-logo.png" "partners/mars.png"
fetch "1633912027308-Z2PUHHVZR07R4A55Z1NJ/queensland+robotics.png" "partners/queensland-robotics.png"
fetch "24345bac-45df-4625-bbe9-4c2b9ddc2bbd/001.jpg" "partners/nambour.jpg"
fetch "2b8da32b-f29f-4fbc-a69f-62b674ff6455/HFSDesign_Logo1-ColourBlack.png" "partners/hfs-design.png"
fetch "42bb721d-a1de-48a5-b595-1a6ee0809d31/reddirtrobotics%2Blogo.jpg" "partners/reddirtrobotics.jpg"
fetch "102b1693-2145-4aea-8012-4b41f4b101f7/195909638_151774876989903_7462155829817761256_n.jpg" "partners/stpeters.jpg"
fetch "74593547-155c-41d6-a3ec-c9f2870043e0/1631322817046.jpeg" "partners/bgs.jpeg"
fetch "298bbc5f-49ee-40c1-aa0c-255bca5d6f12/GISP_black_logo.png" "partners/gisp.png"

# Award badges
fetch "f2922da5-e964-497d-8e45-bfde14f9a16f/body-1-good-design-award_gold-winner_logo-jpg-ximg_-l_8_m-smart_-e1599799902729.jpg" "awards/good-design.jpg"
fetch "1633901132092-1CSSNZ4JPWFXQG9XDGP4/advanced-queensland.png" "awards/advance-queensland.png"
fetch "1633901205930-0TMN1CR2WD3UFNKMOT15/youngstarterslogo.png" "awards/young-starters.png"
fetch "58a43758-897b-4821-92f8-16f21b3a7bab/Australian-Made-Owned-full-colour-logo.jpg" "awards/australian-made.jpg"
fetch "1633901223722-ULL99PJTYXHRA7W8VQQ7/icralogo.png" "awards/icra.png"
fetch "1633901251644-ZCCMHUZF7QIK8NXWNKZI/blueboxlogo.png" "awards/bluebox.png"
fetch "b205cc03-1282-44ef-8e94-9a99037dbca5/PL.jpg" "awards/pl.jpg"

# Product/Platform images
fetch "d7e06c90-5ae8-4a25-aa51-695b63542b76/platform.png" "hero/platform.png"
fetch "15c6cb9e-36ed-4b8f-96e6-4485ec587d77/rover-candid.png" "products/rover-candid.png"
fetch "a738f6d7-a969-445f-982c-71d9f80943b1/rover-big.png" "products/rover-big.png"
fetch "afa456f1-1da2-4e8f-9a13-44503985bf1d/driving-school-duckie-town.png" "hero/driving-school.png"
fetch "e3de22cd-11b8-4bf4-87c1-155d84607e84/simulator-big.png" "products/simulator-big.png"
fetch "1d1b50df-b357-4eb2-8e83-4a7ea36152ae/code-editor-big.png" "products/code-editor-big.png"
fetch "1633409192147-YZB8VCH6IQVIX6JJ00R7/class-management.png" "products/class-management.png"
fetch "b8018b7c-dd04-4b0e-a77f-5d648be55b85/amo-2021-logo-rgb-full-colour-clear.png" "awards/australian-made-owned.png"
fetch "04fae17e-865c-4f22-af38-fe8c023406a4/KeyShot+Renders.70.jpg" "products/rover-render.jpg"
fetch "1633397571502-7P7WRMB5N3X0WDVSV1K4/rover.png" "products/rover.png"
fetch "1633398402343-BCFYMKQICJQC5M34D296/rover-digger-full.png" "products/rover-digger.png"
fetch "1633398257202-RKGBJDDOEVTIYVGZWDG3/cords.png" "products/cords.png"
fetch "1633398231053-6EF4N4ZGLIG92FXJONTI/case.png" "products/case.png"
fetch "1633400391317-34X5T9TL6I1ASWIZYH0X/image-asset.png" "products/rover-specs.png"
fetch "1633988093034-A27ZE2HMSHGGDK7VDSM9/header+video+with+laptop.gif" "hero/code-editor-demo.gif"
fetch "514f66e4-0c76-4ca8-8451-b38d0c4b39e6/2021-12-16_16-01-19_1639630906.jpg" "hero/3d-printing.jpg"
fetch "1633402428790-4PHP6JHFUYICKH614XQW/from-rover-to-software.gif" "hero/rover-to-software.gif"

if [ "$failures" -gt 0 ]; then
  echo "Finished with $failures failed download(s)" >&2
  exit 1
fi

echo "Done downloading assets"
|
||||
111
scripts/fix-all-mdx.mjs
Normal file
111
scripts/fix-all-mdx.mjs
Normal file
@@ -0,0 +1,111 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const dir = path.join(__dirname, '..', 'content', 'resources');

/**
 * Split a file into its frontmatter (up to and including the closing `---`)
 * and the remaining body.
 *
 * The closing delimiter must start a line, so a `---` appearing inside a
 * frontmatter value is not mistaken for the end of the block.
 *
 * @param {string} source - Complete file contents.
 * @returns {{frontmatter: string, body: string} | null} `null` when no closing
 *   delimiter is found.
 */
function splitFrontmatter(source) {
  const closing = source.indexOf('\n---', 3);
  if (closing === -1) return null;
  const end = closing + 4; // newline plus the three dashes
  return { frontmatter: source.substring(0, end), body: source.substring(end) };
}

/**
 * Escape `<` and `>` on one line of body text as `&lt;` / `&gt;`.
 *
 * Left untouched: inline code spans, brackets already preceded by a
 * backslash, and a `>` that is the first non-blank character of the line
 * (markdown blockquote marker).
 *
 * @param {string} line - A single line without its trailing newline.
 * @returns {string} The escaped line.
 */
function escapeAngleBrackets(line) {
  return line.replace(/`[^`]+`|[<>]/g, (match, offset) => {
    if (match.length > 1) return match; // inline code span: keep verbatim
    if (offset > 0 && line[offset - 1] === '\\') return match; // already escaped
    if (match === '<') return '&lt;';
    if (/^\s*$/.test(line.substring(0, offset))) return match; // blockquote marker
    return '&gt;';
  });
}

/**
 * Clean one MDX body: drop Squarespace navigation links and CSS residue, then
 * escape stray angle brackets while preserving iframe/video/source tags.
 *
 * @param {string} body - Everything after the frontmatter block.
 * @returns {string} The cleaned body.
 */
function cleanBody(body) {
  let content = body
    // Remove BACK TO POSTS and OPEN PAGE AS PDF
    .replace(/\[\s*\*?\*?\s*BACK TO POSTS\s*\*?\*?\s*\]\([^)]*\)/gi, '')
    .replace(/\*?\*?\s*BACK TO POSTS\s*\*?\*?/gi, '')
    .replace(/\|?\s*\[?\s*\*?\*?\s*OPEN PAGE AS PDF\s*\*?\*?\s*\]?\s*\([^)]*\)/gi, '')
    // Remove CSS block definitions
    .replace(/#block-[a-zA-Z0-9_-]+ \{[^}]*?\}/g, '')
    // Remove --> artifacts
    .replace(/^-->\s*$/gm, '');

  // Strategy: swap safe tags for placeholders, escape everything else, then
  // put the safe tags back.
  const safeTags = [];
  const protect = (tag) => {
    safeTags.push(tag);
    return `__SAFE_TAG_${safeTags.length - 1}__`;
  };
  content = content
    .replace(/<(\/?)iframe([^>]*)>/gi, (tag) => protect(tag))
    .replace(/<(\/?)video([^>]*)>/gi, (tag) => protect(tag))
    .replace(/<source([^>]*)>/gi, (tag) => protect(tag));

  const escapedLines = [];
  let insideFence = false;
  for (const line of content.split('\n')) {
    const trimmed = line.trim();
    if (trimmed.startsWith('```')) {
      // Fenced code is literal in MDX; escaping there would corrupt the code.
      insideFence = !insideFence;
      escapedLines.push(line);
    } else if (
      insideFence ||
      trimmed.startsWith('![') || // image markdown line
      /^__SAFE_TAG_\d+__$/.test(trimmed) // line holding only a protected tag
    ) {
      escapedLines.push(line);
    } else {
      escapedLines.push(escapeAngleBrackets(line));
    }
  }
  content = escapedLines.join('\n');

  // Restore safe tags. A function replacer keeps any `$` in the tag literal.
  content = content.replace(
    /__SAFE_TAG_(\d+)__/g,
    (placeholder, index) => safeTags[Number(index)] ?? placeholder,
  );

  // Clean excessive blank lines
  return content.replace(/\n{3,}/g, '\n\n');
}

/**
 * Strip navigation text, markdown syntax and CSS residue from an excerpt and
 * cap it at 250 characters.
 *
 * @param {string} raw - Excerpt text captured between the double quotes.
 * @returns {string} The cleaned excerpt.
 */
function cleanExcerptText(raw) {
  return raw
    .replace(/\*?\*?BACK TO POSTS\*?\*?/gi, '')
    .replace(/!\[[^\]]*\]\([^)]*\)/g, '')
    .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')
    .replace(/\*\*/g, '')
    .replace(/#block-[^\s]+ \{[^}]*?\}/g, '')
    .replace(/\s*\|\s*/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
    .substring(0, 250);
}

let fixed = 0;
for (const f of fs.readdirSync(dir).filter((entry) => entry.endsWith('.mdx'))) {
  const filepath = path.join(dir, f);
  const original = fs.readFileSync(filepath, 'utf8');

  const parts = splitFrontmatter(original);
  if (parts === null) continue;

  let { frontmatter } = parts;
  const content = cleanBody(parts.body);

  // === Fix frontmatter excerpt ===
  const excerptMatch = frontmatter.match(/^excerpt: "(.*)"$/m);
  if (excerptMatch) {
    const excerpt = cleanExcerptText(excerptMatch[1]);
    // Function replacer: `$&`, `$$` … in the excerpt must be inserted literally.
    frontmatter = frontmatter.replace(excerptMatch[0], () => `excerpt: "${excerpt}"`);
  }

  const result = frontmatter + content;
  if (result !== original) {
    fs.writeFileSync(filepath, result);
    fixed++;
  }
}
console.log(`Fixed ${fixed} files`);
|
||||
37
scripts/fix-bullets.mjs
Normal file
37
scripts/fix-bullets.mjs
Normal file
@@ -0,0 +1,37 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const dir = path.join(__dirname, '..', 'content', 'resources');

/**
 * Re-attach list markers that ended up on their own line.
 *
 * A line holding only "-" (or "1.") followed by a blank line and then text is
 * rewritten as "- text" (or "1. text").
 *
 * @param {string} body - MDX body text (everything after the frontmatter).
 * @returns {string} The body with list markers joined to their content.
 */
function joinDetachedListMarkers(body) {
  return body
    // "-" on its own line, blank line, then content
    .replace(/^- *\n\n(.+)/gm, '- $1')
    // Same pattern for numbered lists: "1." on its own line, blank, content
    .replace(/^(\d+)\. *\n\n(.+)/gm, '$1. $2')
    // Clean up any resulting triple+ blank lines
    .replace(/\n{3,}/g, '\n\n');
}

let fixed = 0;
for (const f of fs.readdirSync(dir).filter((entry) => entry.endsWith('.mdx'))) {
  const filepath = path.join(dir, f);
  const original = fs.readFileSync(filepath, 'utf8');

  // Split frontmatter from content. The closing delimiter must start a line so
  // that a "---" inside a frontmatter value is not taken as the end.
  const closing = original.indexOf('\n---', 3);
  if (closing === -1) continue;
  const frontmatterEnd = closing + 4; // newline plus the three dashes
  const frontmatter = original.substring(0, frontmatterEnd);
  const content = joinDetachedListMarkers(original.substring(frontmatterEnd));

  const result = frontmatter + content;
  if (result !== original) {
    fs.writeFileSync(filepath, result);
    fixed++;
    console.log(`  Fixed: ${f}`);
  }
}
console.log(`Fixed bullet lists in ${fixed} files`);
|
||||
59
scripts/fix-code-blocks.mjs
Normal file
59
scripts/fix-code-blocks.mjs
Normal file
@@ -0,0 +1,59 @@
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const contentDir = path.join(__dirname, "..", "content", "resources");

/**
 * Turn `&lt;` / `&gt;` entities back into literal angle brackets.
 *
 * @param {string} text - A code line or an inline code span.
 * @returns {string} The text with both entities decoded.
 */
function unescapeAngleEntities(text) {
  return text.replace(/&lt;/g, "<").replace(/&gt;/g, ">");
}

const files = fs.readdirSync(contentDir).filter((f) => f.endsWith(".mdx"));

let totalFixed = 0;

for (const file of files) {
  const filePath = path.join(contentDir, file);
  const raw = fs.readFileSync(filePath, "utf-8");

  // Split frontmatter from content
  const fmMatch = raw.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/);
  if (!fmMatch) continue;
  const [, frontmatter, content] = fmMatch;

  let insideFencedBlock = false;
  const result = [];

  for (const line of content.split("\n")) {
    // A line starting with ``` opens or closes a fenced block; keep it as is.
    if (/^```/.test(line.trimStart())) {
      insideFencedBlock = !insideFencedBlock;
      result.push(line);
      continue;
    }

    if (insideFencedBlock) {
      // Inside fenced code block: unescape HTML entities on the whole line
      result.push(unescapeAngleEntities(line));
    } else {
      // Outside fenced block: only touch inline code spans
      result.push(line.replace(/`[^`]+`/g, (span) => unescapeAngleEntities(span)));
    }
  }

  const fixedContent = result.join("\n");
  if (fixedContent !== content) {
    const output = `---\n${frontmatter}\n---\n${fixedContent}`;
    fs.writeFileSync(filePath, output, "utf-8");
    totalFixed++;
    console.log(`Fixed: ${file}`);
  }
}

console.log(`\nDone. Fixed ${totalFixed} files.`);
|
||||
52
scripts/fix-mdx-jsx.mjs
Normal file
52
scripts/fix-mdx-jsx.mjs
Normal file
@@ -0,0 +1,52 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const dir = path.join(__dirname, '..', 'content', 'resources');

// Known safe HTML/JSX tags that MDX should handle
const safeTags = new Set([
  'iframe', 'video', 'source', 'br', 'hr', 'img',
  'div', 'span', 'p', 'a',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'ul', 'ol', 'li',
  'strong', 'em', 'b', 'i', 'u',
  'code', 'pre',
  'blockquote',
  'table', 'tr', 'td', 'th', 'thead', 'tbody',
]);

/**
 * Backslash-escape angle brackets that are not part of a known safe tag.
 *
 * Brackets already preceded by a backslash are skipped, so running the script
 * twice does not double-escape.
 *
 * @param {string} content - MDX body text (everything after the frontmatter).
 * @returns {string} The body with stray brackets escaped.
 */
function escapeStrayAngleBrackets(content) {
  // Pass 1: `<...>` spans that look like tags but are not in the safe list.
  const tagsEscaped = content.replace(
    /(?<!\\)<(?!\/?(?:iframe|video|source|br|hr|img)\b)([^<>]*?)>/g,
    (match, inner) => {
      // Drop a leading "/" first so that closing tags such as </div> resolve to
      // the same name as their opening tag.
      const tagName = inner.replace(/^\//, '').split(/[\s/]/)[0].toLowerCase();
      if (safeTags.has(tagName) || tagName.startsWith('!--')) return match;
      // It's a stray angle bracket pair - escape it
      return `\\<${inner}\\>`;
    },
  );

  // Pass 2: bare "<" that cannot start a tag (like math: x < y).
  return tagsEscaped.replace(/(?<!\\)<(?![/!a-zA-Z\\])/g, '\\<');
}

let fixed = 0;
for (const f of fs.readdirSync(dir).filter((entry) => entry.endsWith('.mdx'))) {
  const filepath = path.join(dir, f);
  const original = fs.readFileSync(filepath, 'utf8');

  // Split into frontmatter and content. The closing delimiter must start a
  // line so that a "---" inside a frontmatter value is not taken as the end.
  const closing = original.indexOf('\n---', 3);
  if (closing === -1) continue;
  const fmEnd = closing + 4; // newline plus the three dashes
  const frontmatter = original.substring(0, fmEnd);
  const content = escapeStrayAngleBrackets(original.substring(fmEnd));

  const result = frontmatter + content;
  if (result !== original) {
    fs.writeFileSync(filepath, result);
    fixed++;
  }
}
console.log(`Fixed JSX issues in ${fixed} files`);
|
||||
280
scripts/scrape-resources.mjs
Normal file
280
scripts/scrape-resources.mjs
Normal file
@@ -0,0 +1,280 @@
|
||||
#!/usr/bin/env node
/**
 * Scrape all resources from the Squarespace site using the JSON API.
 * Usage: node scripts/scrape-resources.mjs
 */
|
||||
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
// ES modules have no __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Output locations, resolved relative to this script's parent directory.
const CONTENT_DIR = path.join(__dirname, '..', 'content', 'resources');
const IMAGE_DIR = path.join(__dirname, '..', 'public', 'images', 'resources');

// Create both output directories (and any missing parents) before use.
fs.mkdirSync(CONTENT_DIR, { recursive: true });
fs.mkdirSync(IMAGE_DIR, { recursive: true });
|
||||
|
||||
/**
 * Extract resource post URLs from sitemap XML.
 *
 * Matches `<loc>` entries under `/resources/` on micromelon.com.au, with or
 * without a `www.` prefix. Category and tag listing pages are excluded, and
 * each URL is returned once, in first-seen order.
 *
 * @param {string} xml - Raw sitemap XML.
 * @returns {string[]} Unique resource post URLs.
 */
function parseResourceUrls(xml) {
  // The pattern requires at least one character after "/resources/", so the
  // bare listing page never matches and needs no separate check.
  const locPattern = /<loc>\s*(https?:\/\/(?:www\.)?micromelon\.com\.au\/resources\/[^<\s]+)\s*<\/loc>/g;
  const unique = new Set();
  for (const [, url] of xml.matchAll(locPattern)) {
    if (url.includes('/category/') || url.includes('/tag/')) continue;
    unique.add(url);
  }
  return [...unique];
}
|
||||
|
||||
/**
 * Fetch the JSON representation of a page by requesting it with
 * `format=json-pretty`.
 *
 * @param {string} url - Absolute page URL.
 * @returns {Promise<object | null>} The parsed JSON, or `null` when the request
 *   fails, the response is not OK, the body is empty, or it is not valid JSON.
 */
async function fetchPageJson(url) {
  let res;
  try {
    // Build the query with URL so a page URL that already carries a query
    // string still produces a valid request.
    const target = new URL(url);
    target.searchParams.set('format', 'json-pretty');
    res = await fetch(target, {
      headers: { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' },
    });
  } catch (err) {
    // One unreachable page should be skipped by the caller, not abort the run.
    console.warn(`Request for ${url} failed: ${err.message}`);
    return null;
  }

  if (!res.ok) return null;
  const text = await res.text();
  if (!text.trim()) return null;
  try {
    return JSON.parse(text);
  } catch {
    // Not JSON (for example an HTML error page): treated as "no data".
    return null;
  }
}
|
||||
|
||||
/** Give a protocol-relative URL (`//host/…`) an explicit `https:` scheme. */
function toAbsoluteSrc(src) {
  return src.startsWith('//') ? `https:${src}` : src;
}

/**
 * Decode the HTML entities this scraper handles.
 *
 * `&amp;` is decoded last so that doubly-encoded text such as `&amp;lt;`
 * becomes `&lt;` rather than `<`.
 */
function decodeHtmlEntities(text) {
  return text
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/&#x27;/g, "'")
    .replace(/&#x2F;/g, '/')
    .replace(/&amp;/g, '&');
}

/**
 * Convert a Squarespace post body (HTML) into markdown.
 *
 * Images become `![](url)`, video wrappers and iframes become a normalised
 * `<iframe>` element, common inline/block tags become their markdown
 * equivalents, every other tag is dropped, and entities are decoded.
 *
 * @param {string} html - Post body HTML; falsy input yields an empty string.
 * @returns {string} Trimmed markdown with at most one blank line in a row.
 */
function sqsBodyToMarkdown(html) {
  if (!html) return '';

  let md = html;

  // Remove style and script blocks, contents included
  md = md.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '');
  md = md.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');

  // Squarespace image blocks: the <noscript> fallback <img> carries the real src
  md = md.replace(
    /<noscript><img[^>]*src="([^"]+)"[^>]*\/?><\/noscript>/gi,
    (m, src) => `\n\n![](${toAbsoluteSrc(src)})\n\n`,
  );

  // Remaining img tags with data-src (lazy loaded)
  md = md.replace(
    /<img[^>]*data-src="([^"]+)"[^>]*>/gi,
    (m, src) => `\n\n![](${toAbsoluteSrc(src)})\n\n`,
  );

  // Regular img tags
  md = md.replace(/<img[^>]*src="([^"]+)"[^>]*>/gi, (m, src) => {
    // Skip tracking pixels and tiny images
    if (src.includes('static1.squarespace.com/static') && !src.includes('content/')) return '';
    return `\n\n![](${toAbsoluteSrc(src)})\n\n`;
  });

  // Video embeds: the embed markup is stored entity-encoded in data-html
  md = md.replace(
    /<div[^>]*class="[^"]*sqs-video-wrapper[^"]*"[^>]*data-html="([^"]*)"[^>]*>[\s\S]*?<\/div>/gi,
    (m, encoded) => {
      const srcMatch = decodeHtmlEntities(encoded).match(/src=["']([^"']+)["']/);
      if (!srcMatch) return '';
      return `\n\n<iframe width="560" height="315" src="${srcMatch[1]}" frameBorder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowFullScreen></iframe>\n\n`;
    },
  );

  // Iframes (this pass also re-normalises the ones generated just above)
  md = md.replace(
    /<iframe[^>]*src="([^"]+)"[^>]*>[\s\S]*?<\/iframe>/gi,
    (m, src) => `\n\n<iframe width="560" height="315" src="${src}" frameBorder="0" allowFullScreen></iframe>\n\n`,
  );

  // Headers
  md = md.replace(/<h1[^>]*>([\s\S]*?)<\/h1>/gi, '\n\n# $1\n\n');
  md = md.replace(/<h2[^>]*>([\s\S]*?)<\/h2>/gi, '\n\n## $1\n\n');
  md = md.replace(/<h3[^>]*>([\s\S]*?)<\/h3>/gi, '\n\n### $1\n\n');
  md = md.replace(/<h4[^>]*>([\s\S]*?)<\/h4>/gi, '\n\n#### $1\n\n');

  // Links
  md = md.replace(/<a[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi, '[$2]($1)');

  // Bold/italic
  md = md.replace(/<strong[^>]*>([\s\S]*?)<\/strong>/gi, '**$1**');
  md = md.replace(/<b[^>]*>([\s\S]*?)<\/b>/gi, '**$1**');
  md = md.replace(/<em[^>]*>([\s\S]*?)<\/em>/gi, '*$1*');

  // Code
  md = md.replace(/<code[^>]*>([\s\S]*?)<\/code>/gi, '`$1`');
  md = md.replace(/<pre[^>]*>([\s\S]*?)<\/pre>/gi, '\n\n```\n$1\n```\n\n');

  // Lists: put each item on its own line, then turn items into "- " bullets
  md = md.replace(/<\/li>\s*<li[^>]*>/gi, '</li>\n<li>');
  md = md.replace(/<li[^>]*>([\s\S]*?)<\/li>/gi, '- $1');
  md = md.replace(/<\/?[uo]l[^>]*>/gi, '\n');

  // Paragraphs, line breaks, rules
  md = md.replace(/<p[^>]*>([\s\S]*?)<\/p>/gi, '\n\n$1\n\n');
  md = md.replace(/<br\s*\/?>/gi, '\n');
  md = md.replace(/<hr\s*\/?>/gi, '\n\n---\n\n');

  // Blockquotes
  md = md.replace(/<blockquote[^>]*>([\s\S]*?)<\/blockquote>/gi, '\n\n> $1\n\n');

  // Remove all remaining HTML tags, except the iframes produced above, which
  // would otherwise be deleted again here.
  md = md.replace(/<(?!\/?iframe\b)[^>]+>/g, '');

  // Decode entities (after tag stripping, so decoded "<" is kept as text)
  md = decodeHtmlEntities(md);

  // Remove "BACK TO POSTS" link that Squarespace adds
  md = md.replace(/\[?\s*\*?\*?BACK TO POSTS\*?\*?\s*\]?\s*\([^)]*\)/gi, '');

  // Clean up excessive whitespace
  md = md.replace(/[ \t]+$/gm, ''); // trailing spaces
  md = md.replace(/^\s+$/gm, ''); // whitespace-only lines -> empty
  md = md.replace(/\n{3,}/g, '\n\n'); // max 2 newlines
  return md.trim();
}
|
||||
|
||||
/**
 * Build a plain-text excerpt of at most 250 characters for a post.
 *
 * Uses `item.excerpt` when it contains text once tags are stripped; otherwise
 * falls back to the start of `item.body`.
 *
 * @param {{excerpt?: string, body?: string}} item - Post item from the page JSON.
 * @returns {string} Tag-free, whitespace-collapsed excerpt (may be empty).
 */
function getCleanExcerpt(item) {
  const toPlainText = (html) => (html || '').replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();

  // An excerpt made only of markup (e.g. "<p></p>") counts as missing.
  const text = toPlainText(item.excerpt) || toPlainText(item.body);
  return text.substring(0, 250);
}
|
||||
|
||||
/**
 * Download an image into IMAGE_DIR unless a file of that name already exists.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} filename - Target file name below IMAGE_DIR.
 * @returns {Promise<string | null>} The site-relative path
 *   (`/images/resources/<filename>`), or `null` when the response is not OK,
 *   the payload is under 100 bytes, or the download/write fails.
 */
async function downloadImage(url, filename) {
  const publicPath = `/images/resources/${filename}`;
  try {
    const filepath = path.join(IMAGE_DIR, filename);
    if (fs.existsSync(filepath)) return publicPath; // already downloaded

    const res = await fetch(url, { headers: { 'User-Agent': 'Mozilla/5.0' } });
    if (!res.ok) return null;

    const buffer = Buffer.from(await res.arrayBuffer());
    if (buffer.length < 100) return null; // skip tiny/empty files

    // The filename may contain a sub-path; make sure its directory exists.
    fs.mkdirSync(path.dirname(filepath), { recursive: true });
    fs.writeFileSync(filepath, buffer);
    return publicPath;
  } catch (err) {
    // Best effort: the caller treats null as "no image", but say why it failed.
    console.warn(`Could not download ${url}: ${err.message}`);
    return null;
  }
}
|
||||
|
||||
/**
 * Scrape every resource post listed in the sitemap into CONTENT_DIR.
 *
 * Fetches the sitemap, deletes the existing `.mdx` files, then for each post
 * fetches its JSON, downloads the featured image, converts the body to
 * markdown and writes `<slug>.mdx` with frontmatter. Posts without JSON are
 * counted as skipped.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the sitemap request fails or lists no resource posts;
 *   both are checked before any existing file is deleted.
 */
async function main() {
  console.log('Fetching sitemap...');
  const sitemapRes = await fetch('https://www.micromelon.com.au/sitemap.xml', {
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  if (!sitemapRes.ok) {
    throw new Error(`Sitemap request failed with HTTP ${sitemapRes.status}`);
  }
  const sitemapXml = await sitemapRes.text();
  const urls = parseResourceUrls(sitemapXml);
  console.log(`Found ${urls.length} resource posts.\n`);
  if (urls.length === 0) {
    // Without this guard a bad sitemap would wipe all existing content below.
    throw new Error('No resource URLs found in the sitemap; existing files left untouched.');
  }

  // Clean out old files
  for (const f of fs.readdirSync(CONTENT_DIR)) {
    if (f.endsWith('.mdx')) fs.unlinkSync(path.join(CONTENT_DIR, f));
  }

  // Frontmatter values are wrapped in double quotes, so embedded double quotes
  // are swapped for single quotes (same rule as title and excerpt).
  const quoteForFrontmatter = (value) => `"${String(value).replace(/"/g, "'")}"`;

  let count = 0;
  let skipped = 0;

  for (const url of urls) {
    const slug = url.split('/resources/')[1];
    if (!slug) continue;

    count++;
    process.stdout.write(`[${count}/${urls.length}] ${slug}...`);

    const data = await fetchPageJson(url);
    if (!data || !data.item) {
      console.log(' SKIP (no JSON)');
      skipped++;
      continue;
    }

    const { item } = data;
    const title = (item.title || slug).replace(/"/g, "'");
    // Prefer the publish date, then the added date, then today.
    const date = new Date(item.publishOn || item.addedOn || Date.now()).toISOString().split('T')[0];
    const categories = item.categories || [];
    const tags = item.tags || [];
    const excerpt = getCleanExcerpt(item).replace(/"/g, "'");

    // Featured image
    let featuredImage = '';
    if (item.assetUrl) {
      const imgUrl = item.assetUrl.startsWith('//') ? `https:${item.assetUrl}` : item.assetUrl;
      const ext = path.extname(imgUrl.split('?')[0]) || '.jpg';
      const localPath = await downloadImage(imgUrl, `${slug}${ext}`);
      if (localPath) featuredImage = localPath;
    }

    // Convert body
    const content = sqsBodyToMarkdown(item.body || '');

    // Build frontmatter
    const fm = [
      '---',
      `title: "${title}"`,
      `date: "${date}"`,
      `categories: [${categories.map(quoteForFrontmatter).join(', ')}]`,
      `tags: [${tags.map(quoteForFrontmatter).join(', ')}]`,
      `excerpt: "${excerpt}"`,
    ];
    if (featuredImage) fm.push(`featuredImage: "${featuredImage}"`);
    fm.push('---');

    fs.writeFileSync(
      path.join(CONTENT_DIR, `${slug}.mdx`),
      `${fm.join('\n')}\n\n${content}\n`,
    );

    console.log(` OK (${content.length} chars)`);
    // Short pause between pages to avoid hammering the site.
    await new Promise((resolve) => setTimeout(resolve, 300));
  }

  const files = fs.readdirSync(CONTENT_DIR).filter((f) => f.endsWith('.mdx'));
  const images = fs.readdirSync(IMAGE_DIR);
  console.log(`\nDone! ${files.length} MDX files, ${images.length} images. Skipped: ${skipped}`);
}

main().catch((err) => {
  console.error(err);
  // Report failure to the shell / CI instead of exiting with status 0.
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user