// based on:
// https://github.com/supabase-community/nextjs-openai-doc-search/blob/main/lib/generate-embeddings.ts
import { createClient } from '@supabase/supabase-js';
import { config as loadDotEnvFile } from 'dotenv';
import { expand } from 'dotenv-expand';
import { readFile } from 'fs/promises';
import OpenAI from 'openai';
import yargs from 'yargs';
import { createHash } from 'crypto';
import GithubSlugger from 'github-slugger';
import { fromMarkdown } from 'mdast-util-from-markdown';
import { toMarkdown } from 'mdast-util-to-markdown';
import { toString } from 'mdast-util-to-string';
import { u } from 'unist-builder';
import mapJson from '../../../../docs/map.json' assert { type: 'json' };
import manifestsCI from '../../../../docs/generated/manifests/ci.json' assert { type: 'json' };
import manifestsExtending from '../../../../docs/generated/manifests/extending-nx.json' assert { type: 'json' };
import manifestsNx from '../../../../docs/generated/manifests/nx.json' assert { type: 'json' };
import manifestsPackages from '../../../../docs/generated/manifests/nx-api.json' assert { type: 'json' };
import manifestsTags from '../../../../docs/generated/manifests/tags.json' assert { type: 'json' };
import communityPlugins from '../../../../community/approved-plugins.json' assert { type: 'json' };
let identityMap: Record<string, any> = {};
const myEnv = loadDotEnvFile();
expand(myEnv);
type ProcessedMdx = {
checksum: string;
sections: Section[];
};
type Section = {
content: string;
heading?: string;
slug?: string;
};
/**
* Splits a `mdast` tree into multiple trees based on
* a predicate function. Will include the splitting node
* at the beginning of each tree.
*
* Useful to split a markdown file into smaller sections.
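 *
 * @example
 * // Illustrative usage:
 * const tree = fromMarkdown('# A\n\ntext\n\n# B\n\nmore');
 * const trees = splitTreeBy(tree, (node) => node.type === 'heading');
 * // => two 'root' trees, each starting with its heading node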
*/
export function splitTreeBy(tree: any, predicate: (node: any) => boolean) {
return tree.children.reduce((trees: any, node: any) => {
const [lastTree] = trees.slice(-1);
    if (!lastTree || predicate(node)) {
      const newTree = u('root', [node]);
      return trees.concat(newTree);
    }
lastTree.children.push(node);
return trees;
}, []);
}
/**
 * Processes markdown content for search indexing.
 * Computes a SHA-256 checksum of the raw content and splits the document
 * into sub-sections, one per heading.
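 *
 * @example
 * // Illustrative input and (abridged) output:
 * const { checksum, sections } = processMdxForSearch('# Intro\n\nHello');
 * // checksum => base64-encoded SHA-256 of the content
 * // sections => [{ content: '# Intro\n\nHello\n', heading: 'Intro', slug: 'intro' }]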
*/
export function processMdxForSearch(content: string): ProcessedMdx {
const checksum = createHash('sha256').update(content).digest('base64');
const mdTree = fromMarkdown(content, {});
if (!mdTree) {
return {
checksum,
sections: [],
};
}
const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading');
const slugger = new GithubSlugger();
const sections = sectionTrees.map((tree: any) => {
const [firstNode] = tree.children;
const heading =
firstNode.type === 'heading' ? toString(firstNode) : undefined;
const slug = heading ? slugger.slug(heading) : undefined;
return {
content: toMarkdown(tree),
heading,
slug,
};
});
return {
checksum,
sections,
};
}
type WalkEntry = {
path: string;
url_partial: string;
};
abstract class BaseEmbeddingSource {
checksum?: string;
sections?: Section[];
constructor(
public source: string,
public path: string,
public url_partial: string
) {}
abstract load(): Promise<{
checksum: string;
sections: Section[];
}>;
}
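/**
 * Embedding source backed by a markdown file on disk, or by an in-memory
 * markdown string when `fileContent` is provided. The page path is derived
 * from the file path by stripping the leading `docs` prefix and the
 * `.md`/`.mdx` extension.
 *
 * @example
 * // A minimal sketch (hypothetical file path and URL):
 * const source = new MarkdownEmbeddingSource(
 *   'guide',
 *   'docs/shared/intro.md',
 *   '/getting-started/intro'
 * );
 * const { checksum, sections } = await source.load();
 */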
class MarkdownEmbeddingSource extends BaseEmbeddingSource {
type: 'markdown' = 'markdown';
constructor(
source: string,
public filePath: string,
public url_partial: string,
public fileContent?: string
) {
    const path = filePath.replace(/^docs/, '').replace(/\.mdx?$/, '');
super(source, path, url_partial);
}
async load() {
const contents =
this.fileContent ?? (await readFile(this.filePath, 'utf8'));
const { checksum, sections } = processMdxForSearch(contents);
this.checksum = checksum;
this.sections = sections;
return {
checksum,
sections,
};
}
}
type EmbeddingSource = MarkdownEmbeddingSource;
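/**
 * Discovers every docs page, skips pages whose checksum is unchanged
 * (unless --refresh is passed), and stores one embedding per page section
 * in Supabase.
 */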
async function generateEmbeddings() {
  const argv = await yargs(process.argv.slice(2)).option('refresh', {
alias: 'r',
description: 'Refresh data',
type: 'boolean',
}).argv;
const shouldRefresh = argv.refresh;
if (!process.env.NX_NEXT_PUBLIC_SUPABASE_URL) {
throw new Error(
'Environment variable NX_NEXT_PUBLIC_SUPABASE_URL is required: skipping embeddings generation'
);
}
if (!process.env.NX_SUPABASE_SERVICE_ROLE_KEY) {
throw new Error(
'Environment variable NX_SUPABASE_SERVICE_ROLE_KEY is required: skipping embeddings generation'
);
}
if (!process.env.NX_OPENAI_KEY) {
throw new Error(
'Environment variable NX_OPENAI_KEY is required: skipping embeddings generation'
);
}
  const supabaseClient = createClient(
    process.env.NX_NEXT_PUBLIC_SUPABASE_URL,
    process.env.NX_SUPABASE_SERVICE_ROLE_KEY,
    {
      auth: {
        persistSession: false,
        autoRefreshToken: false,
      },
    }
  );
  // Create the OpenAI client once, rather than per section; it is stateless
  // and reused for every embedding request.
  const openai = new OpenAI({
    apiKey: process.env.NX_OPENAI_KEY,
  });
  // Ensures that identityMap gets populated first
let allFilesPaths = [...getAllFilesWithItemList(manifestsNx)];
allFilesPaths = [
...allFilesPaths,
...getAllFilesFromMapJson(mapJson),
...getAllFilesWithItemList(manifestsCI),
...getAllFilesWithItemList(manifestsExtending),
...getAllFilesWithItemList(manifestsPackages),
...getAllFilesWithItemList(manifestsTags),
].filter(
(entry) =>
!entry.path.includes('sitemap') && !entry.path.includes('deprecated')
);
const embeddingSources: EmbeddingSource[] = [
...allFilesPaths.map((entry) => {
return new MarkdownEmbeddingSource(
'guide',
entry.path,
entry.url_partial
);
}),
...createMarkdownForCommunityPlugins().map((content, index) => {
return new MarkdownEmbeddingSource(
'community-plugins',
'/community/approved-plugins.json#' + index,
content.url,
content.text
);
}),
];
console.log(`Discovered ${embeddingSources.length} pages`);
if (!shouldRefresh) {
console.log('Checking which pages are new or have changed');
} else {
console.log('Refresh flag set, re-generating all pages');
}
for (const [index, embeddingSource] of embeddingSources.entries()) {
const { type, source, path, url_partial } = embeddingSource;
try {
const { checksum, sections } = await embeddingSource.load();
// Check for existing page in DB and compare checksums
const { error: fetchPageError, data: existingPage } = await supabaseClient
.from('nods_page')
.select('id, path, checksum')
.filter('path', 'eq', path)
.limit(1)
.maybeSingle();
if (fetchPageError) {
throw fetchPageError;
}
// We use checksum to determine if this page & its sections need to be regenerated
if (!shouldRefresh && existingPage?.checksum === checksum) {
continue;
}
if (existingPage) {
if (!shouldRefresh) {
console.log(
`#${index}: [${path}] Docs have changed, removing old page sections and their embeddings`
);
} else {
console.log(
`#${index}: [${path}] Refresh flag set, removing old page sections and their embeddings`
);
}
const { error: deletePageSectionError } = await supabaseClient
.from('nods_page_section')
.delete()
.filter('page_id', 'eq', existingPage.id);
if (deletePageSectionError) {
throw deletePageSectionError;
}
}
// Create/update page record. Intentionally clear checksum until we
// have successfully generated all page sections.
const { error: upsertPageError, data: page } = await supabaseClient
.from('nods_page')
.upsert(
{
checksum: null,
path,
url_partial,
type,
source,
},
{ onConflict: 'path' }
)
.select()
.limit(1)
.single();
if (upsertPageError) {
throw upsertPageError;
}
console.log(
`#${index}: [${path}] Adding ${sections.length} page sections (with embeddings)`
);
console.log(
`${embeddingSources.length - index - 1} pages remaining to process.`
);
for (const { slug, heading, content } of sections) {
// OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
const input = content.replace(/\n/g, ' ');
        try {
          const embeddingResponse = await openai.embeddings.create({
            model: 'text-embedding-ada-002',
            input,
          });
const [responseData] = embeddingResponse.data;
const longer_heading =
source !== 'community-plugins'
? createLongerHeading(heading, url_partial)
: heading;
const { error: insertPageSectionError, data: pageSection } =
await supabaseClient
.from('nods_page_section')
.insert({
page_id: page.id,
slug,
              heading:
                heading?.length && heading !== 'null'
                  ? heading
                  : longer_heading,
longer_heading,
content,
url_partial,
token_count: embeddingResponse.usage.total_tokens,
embedding: responseData.embedding,
})
.select()
.limit(1)
.single();
if (insertPageSectionError) {
throw insertPageSectionError;
}
          // Add a 0.5 second delay after each request
          await delay(500);
} catch (err) {
// TODO: decide how to better handle failed embeddings
console.error(
`Failed to generate embeddings for '${path}' page section starting with '${input.slice(
0,
40
)}...'`
);
throw err;
}
}
// Set page checksum so that we know this page was stored successfully
const { error: updatePageError } = await supabaseClient
.from('nods_page')
.update({ checksum })
.filter('id', 'eq', page.id);
if (updatePageError) {
throw updatePageError;
}
} catch (err) {
console.error(
`Page '${path}' or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated.`
);
console.error(err);
}
}
console.log('Embedding generation complete');
}
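/** Promise-based sleep, used to space out consecutive OpenAI requests. */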
function delay(ms: number) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
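/**
 * Walks the map.json document tree and collects an entry for every item
 * that references a file. URLs are resolved through `identityMap`, which
 * `getAllFilesWithItemList` populates, so the manifest walks must run first.
 */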
function getAllFilesFromMapJson(doc: any): WalkEntry[] {
  const files: WalkEntry[] = [];
  function traverse(itemList: any[]) {
for (const item of itemList) {
if (item.file && item.file.length > 0) {
// we can exclude some docs here, eg. the deprecated ones
// the path is the relative path to the file within the nx repo
// the url_partial is the relative path to the file within the docs site - under nx.dev
files.push({
path: `docs/${item.file}.md`,
url_partial: identityMap[item.id]?.path || '',
});
}
if (item.itemList) {
traverse(item.itemList);
}
}
}
for (const item of doc.content) {
traverse([item]);
}
return files;
}
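/**
 * Recursively collects entries from a generated manifest, descending into
 * nested `itemList`, `documents`, `generators`, and `executors` collections.
 * Side effect: registers each item in `identityMap` by id, for later URL
 * lookups in `getAllFilesFromMapJson`.
 */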
function getAllFilesWithItemList(data: any): WalkEntry[] {
  const files: WalkEntry[] = [];
  function traverse(itemList: any[]) {
for (const item of itemList) {
if (item.file && item.file.length > 0) {
// the path is the relative path to the file within the nx repo
// the url_partial is the relative path to the file within the docs site - under nx.dev
files.push({ path: `docs/${item.file}.md`, url_partial: item.path });
if (!identityMap[item.id]) {
identityMap = { ...identityMap, [item.id]: item };
}
}
if (item.itemList) {
traverse(item.itemList);
}
}
}
for (const key in data) {
if (data[key].itemList) {
traverse([data[key]]);
} else {
if (data[key].documents) {
files.push(...getAllFilesWithItemList(data[key].documents));
}
if (data[key].generators) {
files.push(...getAllFilesWithItemList(data[key].generators));
}
if (data[key].executors) {
files.push(...getAllFilesWithItemList(data[key].executors));
}
if (data[key]?.length > 0) {
traverse(data[key]);
}
}
}
return files;
}
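/**
 * Builds a more descriptive heading from the URL when the section heading
 * is missing or uninformative.
 *
 * @example
 * // Illustrative inputs and outputs (hypothetical URL):
 * createLongerHeading('Overview', '/recipes/react'); // 'Overview - Recipes'
 * createLongerHeading(undefined, '/recipes/react'); // 'Recipes - React'
 */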
function createLongerHeading(
heading?: string | null,
url_partial?: string
): string | undefined {
if (url_partial?.length) {
    if (heading?.length && heading !== 'null') {
      const segment = url_partial.split('/')[1];
      return `${heading} - ${segment?.[0]?.toUpperCase() + segment?.slice(1)}`;
} else {
return url_partial
.split('#')[0]
.split('/')
.map((part) =>
part?.length ? part[0]?.toUpperCase() + part.slice(1) + ' - ' : ''
)
.join('')
.slice(0, -3);
}
}
}
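/**
 * Generates a synthetic markdown snippet for each approved community plugin
 * so that plugins are searchable even without a dedicated docs page.
 */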
function createMarkdownForCommunityPlugins(): {
text: string;
url: string;
}[] {
return communityPlugins.map((plugin) => {
return {
text: `## ${plugin.name} plugin\n\nThere is a ${plugin.name} community plugin.\n\nHere is the description for it: ${plugin.description}\n\nHere is the link to it: [${plugin.url}](${plugin.url})\n\nHere is the list of all the plugins that exist for Nx: https://nx.dev/plugin-registry`,
url: plugin.url,
};
});
}
async function main() {
await generateEmbeddings();
}
main().catch((err) => console.error(err));