// based on:
// https://github.com/supabase-community/nextjs-openai-doc-search/blob/main/lib/generate-embeddings.ts
import { createClient } from '@supabase/supabase-js';
import { config as loadDotEnvFile } from 'dotenv';
import { expand } from 'dotenv-expand';
import { readFile } from 'fs/promises';
import OpenAI from 'openai';
import yargs from 'yargs';
import { createHash } from 'crypto';
import GithubSlugger from 'github-slugger';
import { fromMarkdown } from 'mdast-util-from-markdown';
import { toMarkdown } from 'mdast-util-to-markdown';
import { toString } from 'mdast-util-to-string';
import { u } from 'unist-builder';
import mapJson from '../../../../docs/map.json' assert { type: 'json' };
import manifestsCloud from '../../../../docs/generated/manifests/cloud.json' assert { type: 'json' };
import manifestsExtending from '../../../../docs/generated/manifests/extending-nx.json' assert { type: 'json' };
import manifestsNx from '../../../../docs/generated/manifests/nx.json' assert { type: 'json' };
import manifestsPackages from '../../../../docs/generated/manifests/nx-api.json' assert { type: 'json' };
import manifestsTags from '../../../../docs/generated/manifests/tags.json' assert { type: 'json' };
import communityPlugins from '../../../../community/approved-plugins.json' assert { type: 'json' };

// Maps document ids to their manifest entries; populated while walking the manifests.
let identityMap: Record<string, any> = {};

const myEnv = loadDotEnvFile();
expand(myEnv);

type ProcessedMdx = {
  checksum: string;
  sections: Section[];
};

type Section = {
  content: string;
  heading?: string;
  slug?: string;
};

/**
 * Splits a `mdast` tree into multiple trees based on
 * a predicate function. Will include the splitting node
 * at the beginning of each tree.
 *
 * Useful to split a markdown file into smaller sections.
 */
export function splitTreeBy(tree: any, predicate: (node: any) => boolean) {
  return tree.children.reduce((trees: any, node: any) => {
    const [lastTree] = trees.slice(-1);

    if (!lastTree || predicate(node)) {
      const tree = u('root', [node]);
      return trees.concat(tree);
    }

    lastTree.children.push(node);
    return trees;
  }, []);
}

/**
 * Processes markdown content for search indexing.
 * It computes a checksum and splits the document into sub-sections,
 * one per heading.
 */
export function processMdxForSearch(content: string): ProcessedMdx {
  const checksum = createHash('sha256').update(content).digest('base64');

  const mdTree = fromMarkdown(content, {});

  if (!mdTree) {
    return {
      checksum,
      sections: [],
    };
  }

  const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading');

  const slugger = new GithubSlugger();

  const sections = sectionTrees.map((tree: any) => {
    const [firstNode] = tree.children;

    const heading =
      firstNode.type === 'heading' ? toString(firstNode) : undefined;
    const slug = heading ? slugger.slug(heading) : undefined;

    return {
      content: toMarkdown(tree),
      heading,
      slug,
    };
  });

  return {
    checksum,
    sections,
  };
}
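// Illustrative example (not executed): given a small markdown document,
// `processMdxForSearch` yields one section per heading, each with a
// GitHub-style slug. The values below are representative, not actual output.
//
//   const { sections } = processMdxForSearch(
//     '# Intro\n\nHello.\n\n## Usage\n\nRun it.'
//   );
//   // sections[0] -> { heading: 'Intro', slug: 'intro', content: '# Intro\n\nHello.\n' }
//   // sections[1] -> { heading: 'Usage', slug: 'usage', content: '## Usage\n\nRun it.\n' }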
type WalkEntry = {
  path: string;
  url_partial: string;
};

abstract class BaseEmbeddingSource {
  checksum?: string;
  sections?: Section[];

  constructor(
    public source: string,
    public path: string,
    public url_partial: string
  ) {}

  abstract load(): Promise<{
    checksum: string;
    sections: Section[];
  }>;
}

class MarkdownEmbeddingSource extends BaseEmbeddingSource {
  type: 'markdown' = 'markdown';

  constructor(
    source: string,
    public filePath: string,
    public url_partial: string,
    public fileContent?: string
  ) {
    const path = filePath.replace(/^docs/, '').replace(/\.md$/, '');

    super(source, path, url_partial);
  }

  async load() {
    const contents =
      this.fileContent ?? (await readFile(this.filePath, 'utf8'));

    const { checksum, sections } = processMdxForSearch(contents);

    this.checksum = checksum;
    this.sections = sections;

    return {
      checksum,
      sections,
    };
  }
}

type EmbeddingSource = MarkdownEmbeddingSource;

async function generateEmbeddings() {
  const argv = await yargs(process.argv.slice(2)).option('refresh', {
    alias: 'r',
    description: 'Refresh data',
    type: 'boolean',
  }).argv;

  const shouldRefresh = argv.refresh;

  if (!process.env.NX_NEXT_PUBLIC_SUPABASE_URL) {
    throw new Error(
      'Environment variable NX_NEXT_PUBLIC_SUPABASE_URL is required: skipping embeddings generation'
    );
  }

  if (!process.env.NX_SUPABASE_SERVICE_ROLE_KEY) {
    throw new Error(
      'Environment variable NX_SUPABASE_SERVICE_ROLE_KEY is required: skipping embeddings generation'
    );
  }

  if (!process.env.NX_OPENAI_KEY) {
    throw new Error(
      'Environment variable NX_OPENAI_KEY is required: skipping embeddings generation'
    );
  }

  const supabaseClient = createClient(
    process.env.NX_NEXT_PUBLIC_SUPABASE_URL,
    process.env.NX_SUPABASE_SERVICE_ROLE_KEY,
    {
      auth: {
        persistSession: false,
        autoRefreshToken: false,
      },
    }
  );

  // Create the OpenAI client once and reuse it for every embedding request
  const openai = new OpenAI({ apiKey: process.env.NX_OPENAI_KEY });

  // Walk the nx manifest first so that identityMap gets populated before
  // the other sources are resolved against it
  let allFilesPaths = [...getAllFilesWithItemList(manifestsNx)];
  allFilesPaths = [
    ...allFilesPaths,
    ...getAllFilesFromMapJson(mapJson),
    ...getAllFilesWithItemList(manifestsCloud),
    ...getAllFilesWithItemList(manifestsExtending),
    ...getAllFilesWithItemList(manifestsPackages),
    ...getAllFilesWithItemList(manifestsTags),
  ].filter(
    (entry) =>
      !entry.path.includes('sitemap') && !entry.path.includes('deprecated')
  );

  const embeddingSources: EmbeddingSource[] = [
    ...allFilesPaths.map((entry) => {
      return new MarkdownEmbeddingSource(
        'guide',
        entry.path,
        entry.url_partial
      );
    }),
    ...createMarkdownForCommunityPlugins().map((content, index) => {
      return new MarkdownEmbeddingSource(
        'community-plugins',
        '/community/approved-plugins.json#' + index,
        content.url,
        content.text
      );
    }),
  ];

  console.log(`Discovered ${embeddingSources.length} pages`);

  if (!shouldRefresh) {
    console.log('Checking which pages are new or have changed');
  } else {
    console.log('Refresh flag set, re-generating all pages');
  }
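  // The queries below assume roughly this Supabase schema (a sketch only,
  // inferred from the columns this file reads and writes; the real migration
  // lives elsewhere, and `embedding` would be a pgvector column sized for
  // text-embedding-ada-002's 1536-dimensional vectors):
  //
  //   create table nods_page (
  //     id bigserial primary key,
  //     path text unique,
  //     checksum text,
  //     url_partial text,
  //     type text,
  //     source text
  //   );
  //   create table nods_page_section (
  //     id bigserial primary key,
  //     page_id bigint references nods_page (id),
  //     slug text,
  //     heading text,
  //     longer_heading text,
  //     content text,
  //     url_partial text,
  //     token_count int,
  //     embedding vector(1536)
  //   );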
  for (const [index, embeddingSource] of embeddingSources.entries()) {
    const { type, source, path, url_partial } = embeddingSource;

    try {
      const { checksum, sections } = await embeddingSource.load();

      // Check for an existing page in the DB and compare checksums
      const { error: fetchPageError, data: existingPage } =
        await supabaseClient
          .from('nods_page')
          .select('id, path, checksum')
          .filter('path', 'eq', path)
          .limit(1)
          .maybeSingle();

      if (fetchPageError) {
        throw fetchPageError;
      }

      // We use the checksum to determine if this page & its sections need to be regenerated
      if (!shouldRefresh && existingPage?.checksum === checksum) {
        continue;
      }

      if (existingPage) {
        if (!shouldRefresh) {
          console.log(
            `#${index}: [${path}] Docs have changed, removing old page sections and their embeddings`
          );
        } else {
          console.log(
            `#${index}: [${path}] Refresh flag set, removing old page sections and their embeddings`
          );
        }

        const { error: deletePageSectionError } = await supabaseClient
          .from('nods_page_section')
          .delete()
          .filter('page_id', 'eq', existingPage.id);

        if (deletePageSectionError) {
          throw deletePageSectionError;
        }
      }

      // Create/update the page record. Intentionally clear the checksum until
      // all page sections have been generated successfully.
      const { error: upsertPageError, data: page } = await supabaseClient
        .from('nods_page')
        .upsert(
          {
            checksum: null,
            path,
            url_partial,
            type,
            source,
          },
          { onConflict: 'path' }
        )
        .select()
        .limit(1)
        .single();

      if (upsertPageError) {
        throw upsertPageError;
      }

      console.log(
        `#${index}: [${path}] Adding ${sections.length} page sections (with embeddings)`
      );
      console.log(
        `${embeddingSources.length - index - 1} pages remaining to process.`
      );

      for (const { slug, heading, content } of sections) {
        // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
        const input = content.replace(/\n/g, ' ');

        try {
          const embeddingResponse = await openai.embeddings.create({
            model: 'text-embedding-ada-002',
            input,
          });
          const [responseData] = embeddingResponse.data;

          const longer_heading =
            source !== 'community-plugins'
              ? createLongerHeading(heading, url_partial)
              : heading;

          const { error: insertPageSectionError } = await supabaseClient
            .from('nods_page_section')
            .insert({
              page_id: page.id,
              slug,
              heading:
                heading?.length && heading !== 'null'
                  ? heading
                  : longer_heading,
              longer_heading,
              content,
              url_partial,
              token_count: embeddingResponse.usage.total_tokens,
              embedding: responseData.embedding,
            })
            .select()
            .limit(1)
            .single();

          if (insertPageSectionError) {
            throw insertPageSectionError;
          }

          // Throttle: wait 0.5s between embedding requests
          await delay(500);
        } catch (err) {
          // TODO: decide how to better handle failed embeddings
          console.error(
            `Failed to generate embeddings for '${path}' page section starting with '${input.slice(
              0,
              40
            )}...'`
          );

          throw err;
        }
      }

      // Set the page checksum so that we know this page was stored successfully
      const { error: updatePageError } = await supabaseClient
        .from('nods_page')
        .update({ checksum })
        .filter('id', 'eq', page.id);

      if (updatePageError) {
        throw updatePageError;
      }
    } catch (err) {
      console.error(
        `Page '${path}' or one or more of its page sections failed to store properly. The page has been marked with a null checksum to indicate that it needs to be re-generated.`
      );
      console.error(err);
    }
  }

  console.log('Embedding generation complete');
}

function delay(ms: number) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
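// Illustrative shape of the manifest/map data the walkers below traverse
// (a sketch only, inferred from the fields accessed in this file; the real
// files are generated into docs/generated/manifests):
//
//   {
//     "some-entry": {
//       "id": "getting-started",
//       "file": "shared/getting-started/intro",
//       "path": "/getting-started/intro",
//       "itemList": [ /* nested items of the same shape */ ]
//     }
//   }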
function getAllFilesFromMapJson(doc): WalkEntry[] {
  const files: WalkEntry[] = [];

  function traverse(itemList) {
    for (const item of itemList) {
      if (item.file && item.file.length > 0) {
        // we can exclude some docs here, e.g. the deprecated ones
        // `path` is the relative path to the file within the nx repo
        // `url_partial` is the relative path to the file within the docs site - under nx.dev
        files.push({
          path: `docs/${item.file}.md`,
          url_partial: identityMap[item.id]?.path || '',
        });
      }
      if (item.itemList) {
        traverse(item.itemList);
      }
    }
  }

  for (const item of doc.content) {
    traverse([item]);
  }

  return files;
}

function getAllFilesWithItemList(data): WalkEntry[] {
  const files: WalkEntry[] = [];

  function traverse(itemList) {
    for (const item of itemList) {
      if (item.file && item.file.length > 0) {
        // `path` is the relative path to the file within the nx repo
        // `url_partial` is the relative path to the file within the docs site - under nx.dev
        files.push({ path: `docs/${item.file}.md`, url_partial: item.path });
        if (!identityMap[item.id]) {
          identityMap[item.id] = item;
        }
      }
      if (item.itemList) {
        traverse(item.itemList);
      }
    }
  }

  for (const key in data) {
    if (data[key].itemList) {
      traverse([data[key]]);
    } else {
      if (data[key].documents) {
        files.push(...getAllFilesWithItemList(data[key].documents));
      }
      if (data[key].generators) {
        files.push(...getAllFilesWithItemList(data[key].generators));
      }
      if (data[key].executors) {
        files.push(...getAllFilesWithItemList(data[key].executors));
      }
      if (data[key]?.length > 0) {
        traverse(data[key]);
      }
    }
  }

  return files;
}

// Builds a more descriptive heading from the URL when the plain heading is
// missing or uninformative, e.g. ('Overview', '/concepts/mental-model')
// -> 'Overview - Concepts' and (undefined, '/concepts/mental-model')
// -> 'Concepts - Mental-model'.
function createLongerHeading(
  heading?: string | null,
  url_partial?: string
): string | undefined {
  if (!url_partial?.length) {
    return undefined;
  }

  if (heading?.length && heading !== 'null') {
    const section = url_partial.split('/')[1] ?? '';
    const prettySection = section.length
      ? section[0].toUpperCase() + section.slice(1)
      : '';
    return `${heading} - ${prettySection}`;
  }

  return url_partial
    .split('#')[0]
    .split('/')
    .map((part) =>
      part?.length ? part[0].toUpperCase() + part.slice(1) + ' - ' : ''
    )
    .join('')
    .slice(0, -3);
}

function createMarkdownForCommunityPlugins(): {
  text: string;
  url: string;
}[] {
  return communityPlugins.map((plugin) => {
    return {
      text: `## ${plugin.name} plugin\n\nThere is a ${plugin.name} community plugin.\n\nHere is the description for it: ${plugin.description}\n\nHere is the link to it: [${plugin.url}](${plugin.url})\n\nHere is the list of all the plugins that exist for Nx: https://nx.dev/extending-nx/registry`,
      url: plugin.url,
    };
  });
}

async function main() {
  await generateEmbeddings();
}

main().catch((err) => {
  console.error(err);
  // Surface failures to CI with a non-zero exit code
  process.exit(1);
});
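// Expected environment (e.g. in a local .env file picked up by dotenv;
// the values below are placeholders, not real credentials):
//
//   NX_NEXT_PUBLIC_SUPABASE_URL=https://<project-ref>.supabase.co
//   NX_SUPABASE_SERVICE_ROLE_KEY=<service-role-key>
//   NX_OPENAI_KEY=<openai-api-key>
//
// Pass --refresh (alias -r) to re-generate embeddings for every page
// regardless of checksum.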