// isle/code_embedder.ts
//
// 105 lines
// 2.9 KiB
// TypeScript

import { config } from "dotenv";
import { createHash } from "node:crypto";
import { readFileSync, readdirSync, realpathSync, statSync } from "node:fs";
import path from "node:path";
import { z } from "zod";
import { OpenAI } from "openai";
import { createClient } from "@libsql/client";
// Configuration
// libSQL URL of the local database file that stores the embeddings.
const DB_PATH = "file:code_embeddings.db";
// A file is indexed when its name ends with one of these suffixes.
const SOURCE_FILE_EXTENSIONS = [".c", ".cpp", ".h", ".hpp", ".hxx"];
const OPENAI_MODEL = "text-embedding-3-large";
// Width of the F32_BLOB column below; it has to equal the length of the
// vectors returned by OPENAI_MODEL.
const EMBEDDING_DIMENSION = 3072;

// Environment
// Load variables from a local .env file (if any) into process.env, then
// validate them. `.min(1)` rejects an empty OPENAI_API_KEY at startup instead
// of letting it fail later on the first embeddings request.
config();
const EnvSchema = z.object({ OPENAI_API_KEY: z.string().min(1) });
const { OPENAI_API_KEY } = EnvSchema.parse(process.env);

// Clients
const openai = new OpenAI({ apiKey: OPENAI_API_KEY });
const db = createClient({ url: DB_PATH });

// One row per source file: its path, the SHA-256 of its contents (used to
// detect changes) and its embedding vector. Created on first run only.
await db.execute(
  `CREATE TABLE IF NOT EXISTS file_embeddings (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_path TEXT UNIQUE NOT NULL,
checksum TEXT NOT NULL,
embedding F32_BLOB(${EMBEDDING_DIMENSION}) NOT NULL,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)`,
);
/**
 * Computes the SHA-256 digest of a file's raw bytes.
 *
 * @param filePath - Path of the file to hash; it is read synchronously in full.
 * @returns The digest as a lowercase hexadecimal string.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
const getChecksum = (filePath: string): string =>
  createHash("sha256").update(readFileSync(filePath)).digest("hex");
/**
 * Requests an embedding vector for `content` from the OpenAI embeddings API
 * using the model named by OPENAI_MODEL.
 *
 * @param content - Text to embed; sent as a single input, so the response is
 *   expected to carry exactly one embedding.
 * @returns The embedding of the first item in the response.
 * @throws Error when the response contains no embedding. Errors raised by the
 *   SDK call itself propagate unchanged.
 */
const getEmbedding = async (content: string): Promise<number[]> => {
  const { data } = await openai.embeddings.create({
    model: OPENAI_MODEL,
    input: content,
    // "float" asks for plain numbers rather than a base64 string, so the
    // returned embedding needs no type assertion.
    encoding_format: "float",
  });
  const [first] = data;
  if (first === undefined) {
    throw new Error("OpenAI embeddings response contained no data");
  }
  return first.embedding;
};
/**
 * Walks `startDir` depth-first (iteratively, with an explicit stack) and
 * collects every file whose name ends with one of SOURCE_FILE_EXTENSIONS.
 *
 * Symbolic links are followed, but each directory is visited at most once
 * (keyed by its resolved real path), so a link pointing back at an ancestor
 * cannot make the walk loop. Links that cannot be resolved are skipped rather
 * than aborting the scan.
 *
 * @param startDir - Directory to start from.
 * @returns Paths of the matching files, each built by joining onto `startDir`.
 * @throws Whatever `realpathSync` / `readdirSync` throw for a directory that
 *   cannot be resolved or listed.
 */
const findFiles = (startDir: string): string[] => {
  const pending = [startDir];
  const files: string[] = [];
  const visitedDirs = new Set<string>();

  while (pending.length > 0) {
    const current = pending.pop();
    if (current === undefined) break;

    // The real path identifies a directory regardless of which link led here.
    const canonical = realpathSync(current);
    if (visitedDirs.has(canonical)) continue;
    visitedDirs.add(canonical);

    for (const entry of readdirSync(current, { withFileTypes: true })) {
      const fullPath = path.join(current, entry.name);

      let isDirectory = entry.isDirectory();
      if (entry.isSymbolicLink()) {
        // A Dirent describes the link itself; stat the target to learn what
        // it points at. Dangling or self-referential links throw here.
        try {
          isDirectory = statSync(fullPath).isDirectory();
        } catch {
          continue;
        }
      }

      if (isDirectory) {
        pending.push(fullPath);
      } else if (SOURCE_FILE_EXTENSIONS.some((ext) => entry.name.endsWith(ext))) {
        files.push(fullPath);
      }
    }
  }

  return files;
};
/**
 * Embeds one source file and stores the result in `file_embeddings`.
 *
 * Files whose content is empty or whitespace-only are skipped, as are files
 * whose stored checksum matches the current one. A file that is new, or whose
 * checksum changed since it was last stored, gets a fresh embedding.
 *
 * @param filePath - Path of the file to process; also the row's unique key.
 * @throws Errors from reading the file, the embeddings request or the
 *   database propagate to the caller.
 */
const processFile = async (filePath: string): Promise<void> => {
  const content = readFileSync(filePath, "utf8");
  if (!content.trim()) {
    console.log(`Skipping ${filePath} because it is empty`);
    return;
  }

  const checksum = getChecksum(filePath);
  const existing = await db.execute(
    "SELECT checksum FROM file_embeddings WHERE file_path = ?",
    [filePath],
  );
  if (existing.rows[0]?.checksum === checksum) {
    console.log(`Skipping ${filePath} because it already exists`);
    return;
  }

  const embedding = await getEmbedding(content);
  // Upsert: file_path is UNIQUE, so a plain INSERT fails for a file that is
  // already stored with an outdated checksum. The vector is bound as JSON text
  // and converted by vector32(), because a JavaScript number[] is not a
  // bindable SQL value.
  await db.execute(
    `INSERT INTO file_embeddings (file_path, checksum, embedding)
     VALUES (?, ?, vector32(?))
     ON CONFLICT(file_path) DO UPDATE SET
       checksum = excluded.checksum,
       embedding = excluded.embedding,
       last_updated = CURRENT_TIMESTAMP`,
    [filePath, checksum, JSON.stringify(embedding)],
  );
};
// Entry point: embed every matching source file under the directory given as
// the first CLI argument (defaults to the current directory).
const targetDirectory = process.argv[2] ?? ".";
// With throwIfNoEntry: false, statSync returns undefined for a missing path,
// so the message below is printed instead of an uncaught ENOENT exception.
if (!statSync(targetDirectory, { throwIfNoEntry: false })?.isDirectory()) {
  console.error(`Directory '${targetDirectory}' not found`);
  process.exit(1);
}
const files = findFiles(targetDirectory);
// Files are handled one at a time; each is awaited before the next starts.
for (const [index, file] of files.entries()) {
  console.log(`Processing ${file} (${index + 1}/${files.length})`);
  await processFile(file);
}
console.log("Done");
process.exit(0);