From a2b628f40dd6c93646f28d41f4417fb8fcabfb95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrea=20S=C3=A1nchez=20Blanco?= Date: Thu, 21 May 2026 11:51:06 +0200 Subject: [PATCH] feat(docs): split v3 llms.txt into spec-compliant index and full dump --- .../src/routes/llms-full.txt/+server.js | 86 ++++++++ .../src/routes/llms.txt/+server.js | 188 +++++++++++++----- 2 files changed, 220 insertions(+), 54 deletions(-) create mode 100644 packages/stacks-docs/src/routes/llms-full.txt/+server.js diff --git a/packages/stacks-docs/src/routes/llms-full.txt/+server.js b/packages/stacks-docs/src/routes/llms-full.txt/+server.js new file mode 100644 index 0000000000..a31790fd13 --- /dev/null +++ b/packages/stacks-docs/src/routes/llms-full.txt/+server.js @@ -0,0 +1,86 @@ +import { render } from "svelte/server"; +import TurndownService from "turndown"; + +// Full content dump used by tools that need to chunk page bodies +// (e.g. stacks-mcp-server). The spec-compliant index lives at /llms.txt. + +const turndownService = new TurndownService({ + headingStyle: "atx", + codeBlockStyle: "fenced", +}); + +export async function GET() { + const baseUrl = "https://stackoverflow.design/"; + const mdFiles = import.meta.glob("$docs/public/**/**/*.md"); + + let groupedDocs = {}; + + for (const [path, doc] of Object.entries(mdFiles)) { + const parts = path.split("/"); + const publicIndex = parts.indexOf("public"); + + // Derive the section name from the path. Files under system// + // use the sub-group name (e.g. components, base). Files outside system + // (brand, copy, resources, changelog) use the first segment under public. + let group; + if (parts[publicIndex + 1] === "system") { + group = parts[publicIndex + 2] ?? "system"; + } else { + group = parts[publicIndex + 1] ?? "general"; + } + + if (!groupedDocs[group]) { + groupedDocs[group] = []; + } + + groupedDocs[group].push({ path, doc }); + } + + // Header + let output = ` +# Site Content for LLMs +# Generated: ${new Date().toISOString()} +# Site URL: ${baseUrl} +`.trim(); + + // Loop over the sections + for (const [group, docs] of Object.entries(groupedDocs)) { + output += `\n\n## Collection: ${group}`; + + // Render the docs in the group + for (const { path, doc } of docs) { + const page = await doc(); + + output += ` + +### Page: ${page.metadata?.title ?? path} +URL: ${baseUrl}${getSlug(path)} +Date: ${new Date().toISOString()} +description: ${page.metadata?.description ?? ""} + +Content: +${turndownService.turndown(render(page.default).body)} + +--- + +`.trimEnd(); + } + } + + return new Response(output, { + headers: { + "Content-Type": "text/plain", + }, + }); +} + +function getSlug(filePath) { + let slug = filePath; + + slug = slug + .replace("/src/docs/public/", "") + .replace("index.md", "") + .replace(".md", ""); + + return slug; +} diff --git a/packages/stacks-docs/src/routes/llms.txt/+server.js b/packages/stacks-docs/src/routes/llms.txt/+server.js index 6883c5221e..6e360c8981 100644 --- a/packages/stacks-docs/src/routes/llms.txt/+server.js +++ b/packages/stacks-docs/src/routes/llms.txt/+server.js @@ -1,75 +1,155 @@ -import { render } from "svelte/server"; -import TurndownService from "turndown"; +import YAML from "yaml"; +import structureRaw from "$src/structure.yaml?raw"; -const turndownService = new TurndownService({ - headingStyle: "atx", - codeBlockStyle: "fenced", -}); +// Spec-compliant llms.txt index (see https://llmstxt.org). The full content +// dump tools like stacks-mcp-server chunk lives at /llms-full.txt. -export async function GET() { - const baseUrl = "https://stackoverflow.design/"; - const mdFiles = import.meta.glob("$docs/public/**/**/*.md"); - - let groupedDocs = {}; +const BASE_URL = "https://stackoverflow.design"; - for (const [path, doc] of Object.entries(mdFiles)) { - const parts = path.split("/"); +const SITE_TITLE = "Stacks"; +const SITE_DESCRIPTION = + "Stacks provides everything you need to quickly design, build, and ship coherent experiences across all of Stack Overflow—from the brand and product itself, down to how we send emails and write copy."; - const systemIndex = parts.indexOf("system"); - const group = parts[systemIndex + 1]; - - if (!groupedDocs[group]) { - groupedDocs[group] = []; - } +// Top-level navigation entries whose direct children should each become their +// own H2 section. Everything else collapses to a single H2 per top-level. +const FLATTENED_TOP_LEVELS = new Set(["system"]); - groupedDocs[group].push({ path, doc }); +export async function GET() { + let structure = { navigation: [] }; + try { + structure = YAML.parse(structureRaw) ?? { navigation: [] }; + } catch (err) { + console.error("Failed to parse structure.yaml:", err); } - // Header - let output = ` -# Site Content for LLMs -# Generated: ${new Date().toISOString()} -# Site URL: ${baseUrl} -`.trim(); - - // Loop over the sections - for (const [group, docs] of Object.entries(groupedDocs)) { - output += `\n\n## Collection: ${group}`; + const descriptions = await loadPageDescriptions(); + const sections = collectSections(structure.navigation ?? [], descriptions); + const output = renderOutput(sections); - // Render the docs in the group - for (const { path, doc } of docs) { - const page = await doc(); + return new Response(output, { + headers: { "Content-Type": "text/plain" }, + }); +} - output += ` +async function loadPageDescriptions() { + // Read markdown files as raw text rather than loading the compiled Svelte + // module — we only need the frontmatter description, and avoiding the + // module load keeps this independent of every page component's imports. + const mdFiles = import.meta.glob("$docs/public/**/*.md", { + query: "?raw", + import: "default", + }); + const entries = await Promise.all( + Object.entries(mdFiles).map(async ([path, doc]) => { + const raw = await doc(); + return [pathToUrl(path), parseDescription(raw)]; + }) + ); + return Object.fromEntries(entries); +} -### Page: ${page.metadata?.title ?? path} -URL: ${baseUrl}${getSlug(path)} -Date: ${new Date().toISOString()} -description: ${page.metadata?.description ?? ""} +function parseDescription(raw) { + const match = raw.match(/^---\r?\n([\s\S]*?)\r?\n---/); + if (!match) return ""; + try { + const fm = YAML.parse(match[1]) || {}; + return typeof fm.description === "string" ? fm.description : ""; + } catch { + return ""; + } +} -Content: -${turndownService.turndown(render(page.default).body)} +function collectSections(navigation, descriptions) { + const sections = []; + for (const topLevel of navigation) { + if (topLevel.private) continue; + + if (FLATTENED_TOP_LEVELS.has(topLevel.slug)) { + for (const child of topLevel.items ?? []) { + if (child.private) continue; + pushSection( + sections, + child, + [topLevel.slug, child.slug], + descriptions + ); + } + } else { + pushSection( + sections, + topLevel, + [topLevel.slug], + descriptions + ); + } + } + return sections; +} ---- +function pushSection(sections, item, basePath, descriptions) { + const links = []; + walkItems(item, basePath, descriptions, links); + if (links.length > 0) { + sections.push({ title: item.title ?? basePath.join(" / "), links }); + } +} -`.trimEnd(); - } +function walkItems(item, basePath, descriptions, links) { + const children = Array.isArray(item.items) ? item.items : []; + if (children.length === 0) { + emitLink(item, basePath, descriptions, links); + return; + } + for (const child of children) { + if (child.private) continue; + if (child.externalUrl) continue; // external links aren't site pages + walkItems(child, [...basePath, child.slug], descriptions, links); } +} - return new Response(output, { - headers: { - "Content-Type": "text/plain", - }, - }); +function emitLink(item, basePath, descriptions, links) { + const slug = basePath.join("/"); + // Folder-index pages keep their trailing slash (brand/color/index.md → + // /brand/color/); plain .md pages don't (brand/motion.md → /brand/motion). + const candidateUrls = [`/${slug}/`, `/${slug}`]; + const matchedUrl = candidateUrls.find((u) => descriptions[u] != null); + if (!matchedUrl) { + // No matching page file — skip rather than emit a dead link. + return; + } + const cleaned = cleanDescription(descriptions[matchedUrl]); + const url = `${BASE_URL}${matchedUrl}`; + const title = item.title ?? basePath[basePath.length - 1]; + const suffix = cleaned ? `: ${cleaned}` : ""; + links.push(`- [${title}](${url})${suffix}`); } -function getSlug(filePath) { - let slug = filePath; +// Descriptions may contain inline HTML for the rendered page; strip it for +// llms.txt. Loop until stable so unclosed tags can't reintroduce the pattern +// (CodeQL js/incomplete-multi-character-sanitization). +function cleanDescription(input) { + let previous; + let output = input; + do { + previous = output; + output = output.replace(/<[^>]+>/g, ""); + } while (output !== previous); + return output.replace(/\s+/g, " ").trim(); +} - slug = slug +function pathToUrl(path) { + let slug = path .replace("/src/docs/public/", "") - .replace("index.md", "") - .replace(".md", ""); + .replace(/\.md$/, ""); + if (slug.endsWith("/index")) { + return `/${slug.replace(/\/index$/, "")}/`; + } + return `/${slug}`; +} - return slug; +function renderOutput(sections) { + const body = sections + .map((s) => `## ${s.title}\n\n${s.links.join("\n")}`) + .join("\n\n"); + return `# ${SITE_TITLE}\n\n> ${SITE_DESCRIPTION}\n\n${body}\n`; }