// based on:
// https://github.com/supabase-community/nextjs-openai-doc-search/blob/main/lib/generate-embeddings.ts

import { createClient } from '@supabase/supabase-js';
import { config as loadDotEnvFile } from 'dotenv';
import { expand } from 'dotenv-expand';
import { readFile } from 'fs/promises';
import OpenAI from 'openai';
import yargs from 'yargs';
import { createHash } from 'crypto';
import GithubSlugger from 'github-slugger';
import { fromMarkdown } from 'mdast-util-from-markdown';
import { toMarkdown } from 'mdast-util-to-markdown';
import { toString } from 'mdast-util-to-string';
import { u } from 'unist-builder';
import mapJson from '../../../../docs/map.json' assert { type: 'json' };
import manifestsCI from '../../../../docs/generated/manifests/ci.json' assert { type: 'json' };
import manifestsExtending from '../../../../docs/generated/manifests/extending-nx.json' assert { type: 'json' };
import manifestsNx from '../../../../docs/generated/manifests/nx.json' assert { type: 'json' };
import manifestsPackages from '../../../../docs/generated/manifests/nx-api.json' assert { type: 'json' };
import manifestsTags from '../../../../docs/generated/manifests/tags.json' assert { type: 'json' };
import communityPlugins from '../../../../community/approved-plugins.json' assert { type: 'json' };

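// Maps manifest item ids to their manifest entries. Populated by
// getAllFilesWithItemList() and read by getAllFilesFromMapJson() to resolve
// map.json entries to their public URLs.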
let identityMap: Record<string, any> = {};

const myEnv = loadDotEnvFile();
expand(myEnv);

type ProcessedMdx = {
  checksum: string;
  sections: Section[];
};

type Section = {
  content: string;
  heading?: string;
  slug?: string;
};

/**
 * Splits a `mdast` tree into multiple trees based on
 * a predicate function. Will include the splitting node
 * at the beginning of each tree.
 *
 * Useful to split a markdown file into smaller sections.
 */
export function splitTreeBy(tree: any, predicate: (node: any) => boolean) {
  return tree.children.reduce((trees: any, node: any) => {
    const [lastTree] = trees.slice(-1);

    if (!lastTree || predicate(node)) {
      const tree = u('root', [node]);
      return trees.concat(tree);
    }

    lastTree.children.push(node);
    return trees;
  }, []);
}

/**
 * Processes markdown content for search indexing.
 * It computes a checksum of the content and splits the document
 * into sub-sections, one per heading.
 */
export function processMdxForSearch(content: string): ProcessedMdx {
  const checksum = createHash('sha256').update(content).digest('base64');

  const mdTree = fromMarkdown(content, {});

  if (!mdTree) {
    return {
      checksum,
      sections: [],
    };
  }

  const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading');

  const slugger = new GithubSlugger();

  const sections = sectionTrees.map((tree: any) => {
    const [firstNode] = tree.children;

    const heading =
      firstNode.type === 'heading' ? toString(firstNode) : undefined;
    const slug = heading ? slugger.slug(heading) : undefined;

    return {
      content: toMarkdown(tree),
      heading,
      slug,
    };
  });

  return {
    checksum,
    sections,
  };
}

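// A doc file in the nx repo plus the URL it maps to on nx.dev.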
type WalkEntry = {
  path: string;
  url_partial: string;
};

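// Common shape for anything that can be embedded. Subclasses provide the
// content; `checksum` and `sections` are filled in by load().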
abstract class BaseEmbeddingSource {
  checksum?: string;
  sections?: Section[];

  constructor(
    public source: string,
    public path: string,
    public url_partial: string
  ) {}

  abstract load(): Promise<{
    checksum: string;
    sections: Section[];
  }>;
}

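// Embedding source backed by a markdown file on disk, or by an in-memory
// string when `fileContent` is provided (used for community plugins).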
class MarkdownEmbeddingSource extends BaseEmbeddingSource {
  type: 'markdown' = 'markdown';

  constructor(
    source: string,
    public filePath: string,
    public url_partial: string,
    public fileContent?: string
  ) {
    const path = filePath.replace(/^docs/, '').replace(/\.md$/, '');
    super(source, path, url_partial);
  }

  async load() {
    const contents =
      this.fileContent ?? (await readFile(this.filePath, 'utf8'));

    const { checksum, sections } = processMdxForSearch(contents);

    this.checksum = checksum;
    this.sections = sections;

    return {
      checksum,
      sections,
    };
  }
}

type EmbeddingSource = MarkdownEmbeddingSource;

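/**
 * Walks all doc sources, skips pages whose checksum is unchanged (unless
 * --refresh is passed), and stores one embedding per page section in Supabase.
 */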
async function generateEmbeddings() {
  // Parse CLI args explicitly; depending on the yargs version, `yargs()` with
  // no arguments can parse an empty argv, which would make --refresh a no-op.
  const argv = await yargs(process.argv.slice(2)).option('refresh', {
    alias: 'r',
    description: 'Refresh data',
    type: 'boolean',
  }).argv;

  const shouldRefresh = argv.refresh;

  if (!process.env.NX_NEXT_PUBLIC_SUPABASE_URL) {
    throw new Error(
      'Environment variable NX_NEXT_PUBLIC_SUPABASE_URL is required: aborting embeddings generation'
    );
  }

  if (!process.env.NX_SUPABASE_SERVICE_ROLE_KEY) {
    throw new Error(
      'Environment variable NX_SUPABASE_SERVICE_ROLE_KEY is required: aborting embeddings generation'
    );
  }

  if (!process.env.NX_OPENAI_KEY) {
    throw new Error(
      'Environment variable NX_OPENAI_KEY is required: aborting embeddings generation'
    );
  }

  const supabaseClient = createClient(
    process.env.NX_NEXT_PUBLIC_SUPABASE_URL,
    process.env.NX_SUPABASE_SERVICE_ROLE_KEY,
    {
      auth: {
        persistSession: false,
        autoRefreshToken: false,
      },
    }
  );

  // Reuse a single OpenAI client for all embedding requests instead of
  // constructing one per section.
  const openai = new OpenAI({
    apiKey: process.env.NX_OPENAI_KEY,
  });

  // Process manifestsNx first so that identityMap is populated before
  // map.json entries are resolved.
  let allFilesPaths = [...getAllFilesWithItemList(manifestsNx)];

  allFilesPaths = [
    ...allFilesPaths,
    ...getAllFilesFromMapJson(mapJson),
    ...getAllFilesWithItemList(manifestsCI),
    ...getAllFilesWithItemList(manifestsExtending),
    ...getAllFilesWithItemList(manifestsPackages),
    ...getAllFilesWithItemList(manifestsTags),
  ].filter(
    (entry) =>
      !entry.path.includes('sitemap') && !entry.path.includes('deprecated')
  );

  const embeddingSources: EmbeddingSource[] = [
    ...allFilesPaths.map((entry) => {
      return new MarkdownEmbeddingSource(
        'guide',
        entry.path,
        entry.url_partial
      );
    }),
    ...createMarkdownForCommunityPlugins().map((content, index) => {
      return new MarkdownEmbeddingSource(
        'community-plugins',
        '/community/approved-plugins.json#' + index,
        content.url,
        content.text
      );
    }),
  ];

  console.log(`Discovered ${embeddingSources.length} pages`);

  if (!shouldRefresh) {
    console.log('Checking which pages are new or have changed');
  } else {
    console.log('Refresh flag set, re-generating all pages');
  }

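  // Process pages one at a time: load the source, compare checksums against
  // the database, and regenerate sections only when something changed.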
  for (const [index, embeddingSource] of embeddingSources.entries()) {
    const { type, source, path, url_partial } = embeddingSource;

    try {
      const { checksum, sections } = await embeddingSource.load();

      // Check for existing page in DB and compare checksums
      const { error: fetchPageError, data: existingPage } = await supabaseClient
        .from('nods_page')
        .select('id, path, checksum')
        .filter('path', 'eq', path)
        .limit(1)
        .maybeSingle();

      if (fetchPageError) {
        throw fetchPageError;
      }

      // We use checksum to determine if this page & its sections need to be regenerated
      if (!shouldRefresh && existingPage?.checksum === checksum) {
        continue;
      }

      if (existingPage) {
        if (!shouldRefresh) {
          console.log(
            `#${index}: [${path}] Docs have changed, removing old page sections and their embeddings`
          );
        } else {
          console.log(
            `#${index}: [${path}] Refresh flag set, removing old page sections and their embeddings`
          );
        }

        const { error: deletePageSectionError } = await supabaseClient
          .from('nods_page_section')
          .delete()
          .filter('page_id', 'eq', existingPage.id);

        if (deletePageSectionError) {
          throw deletePageSectionError;
        }
      }

      // Create/update page record. Intentionally clear checksum until we
      // have successfully generated all page sections.
      const { error: upsertPageError, data: page } = await supabaseClient
        .from('nods_page')
        .upsert(
          {
            checksum: null,
            path,
            url_partial,
            type,
            source,
          },
          { onConflict: 'path' }
        )
        .select()
        .limit(1)
        .single();

      if (upsertPageError) {
        throw upsertPageError;
      }

      console.log(
        `#${index}: [${path}] Adding ${sections.length} page sections (with embeddings)`
      );
      console.log(
        `${embeddingSources.length - index - 1} pages remaining to process.`
      );

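      // One embedding request (and one nods_page_section row) per section.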
      for (const { slug, heading, content } of sections) {
        // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
        const input = content.replace(/\n/g, ' ');

        try {
          const embeddingResponse = await openai.embeddings.create({
            model: 'text-embedding-ada-002',
            input,
          });

          const [responseData] = embeddingResponse.data;

          const longer_heading =
            source !== 'community-plugins'
              ? createLongerHeading(heading, url_partial)
              : heading;

          const { error: insertPageSectionError, data: pageSection } =
            await supabaseClient
              .from('nods_page_section')
              .insert({
                page_id: page.id,
                slug,
                heading:
                  heading?.length && heading !== 'null'
                    ? heading
                    : longer_heading,
                longer_heading,
                content,
                url_partial,
                token_count: embeddingResponse.usage.total_tokens,
                embedding: responseData.embedding,
              })
              .select()
              .limit(1)
              .single();

          if (insertPageSectionError) {
            throw insertPageSectionError;
          }

          // Add delay after each request
          await delay(500); // delay of 0.5 second
        } catch (err) {
          // TODO: decide how to better handle failed embeddings
          console.error(
            `Failed to generate embeddings for '${path}' page section starting with '${input.slice(
              0,
              40
            )}...'`
          );

          throw err;
        }
      }

      // Set page checksum so that we know this page was stored successfully
      const { error: updatePageError } = await supabaseClient
        .from('nods_page')
        .update({ checksum })
        .filter('id', 'eq', page.id);

      if (updatePageError) {
        throw updatePageError;
      }
    } catch (err) {
      console.error(
        `Page '${path}' or one or more of its page sections failed to store properly. The page has been marked with a null checksum to indicate that it needs to be re-generated.`
      );
      console.error(err);
    }
  }

  console.log('Embedding generation complete');
}

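/** Resolves after `ms` milliseconds; used to space out embedding requests. */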
function delay(ms: number) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

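/**
 * Collects doc entries from docs/map.json. URLs are resolved via identityMap,
 * which is populated by getAllFilesWithItemList(), so that must run first.
 */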
function getAllFilesFromMapJson(doc: any): WalkEntry[] {
  const files: WalkEntry[] = [];
  function traverse(itemList: any[]) {
    for (const item of itemList) {
      if (item.file && item.file.length > 0) {
        // we can exclude some docs here, e.g. the deprecated ones
        // the path is the relative path to the file within the nx repo
        // the url_partial is the relative path to the file within the docs site - under nx.dev
        files.push({
          path: `docs/${item.file}.md`,
          url_partial: identityMap[item.id]?.path || '',
        });
      }

      if (item.itemList) {
        traverse(item.itemList);
      }
    }
  }

  for (const item of doc.content) {
    traverse([item]);
  }
  return files;
}

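/**
 * Recursively collects doc entries from a generated manifest, registering
 * each item in identityMap along the way.
 */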
function getAllFilesWithItemList(data: any): WalkEntry[] {
  const files: WalkEntry[] = [];

  function traverse(itemList: any[]) {
    for (const item of itemList) {
      if (item.file && item.file.length > 0) {
        // the path is the relative path to the file within the nx repo
        // the url_partial is the relative path to the file within the docs site - under nx.dev
        files.push({ path: `docs/${item.file}.md`, url_partial: item.path });
        if (!identityMap[item.id]) {
          identityMap = { ...identityMap, [item.id]: item };
        }
      }

      if (item.itemList) {
        traverse(item.itemList);
      }
    }
  }

  for (const key in data) {
    if (data[key].itemList) {
      traverse([data[key]]);
    } else {
      if (data[key].documents) {
        files.push(...getAllFilesWithItemList(data[key].documents));
      }
      if (data[key].generators) {
        files.push(...getAllFilesWithItemList(data[key].generators));
      }
      if (data[key].executors) {
        files.push(...getAllFilesWithItemList(data[key].executors));
      }
      if (data[key]?.length > 0) {
        traverse(data[key]);
      }
    }
  }
  return files;
}

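/**
 * Prefixes a heading with the capitalized top-level docs area, e.g.
 * ('Overview', '/ci/intro') -> 'Overview - Ci'. Without a usable heading it
 * builds one from the URL path instead, e.g. '/ci/intro' -> 'Ci - Intro'.
 */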
function createLongerHeading(
  heading?: string | null,
  url_partial?: string
): string | undefined {
  if (url_partial?.length) {
    if (heading?.length && heading !== 'null') {
      return `${heading} - ${
        url_partial.split('/')?.[1]?.[0]?.toUpperCase() +
        url_partial.split('/')?.[1]?.slice(1)
      }`;
    } else {
      return url_partial
        .split('#')[0]
        .split('/')
        .map((part) =>
          part?.length ? part[0]?.toUpperCase() + part.slice(1) + ' - ' : ''
        )
        .join('')
        .slice(0, -3);
    }
  }
}

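/**
 * Builds one synthetic markdown snippet per community plugin so the plugin
 * registry entries can be embedded and searched like regular doc pages.
 */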
function createMarkdownForCommunityPlugins(): {
  text: string;
  url: string;
}[] {
  return communityPlugins.map((plugin) => {
    return {
      text: `## ${plugin.name} plugin\n\nThere is a ${plugin.name} community plugin.\n\nHere is the description for it: ${plugin.description}\n\nHere is the link to it: [${plugin.url}](${plugin.url})\n\nHere is the list of all the plugins that exist for Nx: https://nx.dev/plugin-registry`,
      url: plugin.url,
    };
  });
}

async function main() {
  await generateEmbeddings();
}

main().catch((err) => console.error(err));