// Mirror of https://github.com/isledecomp/isle.git (synced 2026-01-12 03:01:17 +00:00).
// File as mirrored: 105 lines, 2.9 KiB, TypeScript.
import { config } from "dotenv";
|
|
import { createHash } from "node:crypto";
|
|
import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
import path from "node:path";
|
|
import { z } from "zod";
|
|
import { OpenAI } from "openai";
|
|
import { createClient } from "@libsql/client";
|
|
|
|
// configuration

/** Connection URL handed to the database client. */
const DB_PATH = "file:code_embeddings.db";

/** File-name suffixes that mark a file as source code worth embedding. */
const SOURCE_FILE_EXTENSIONS = [
  ".c",
  ".cpp",
  ".h",
  ".hpp",
  ".hxx",
];

/** Name of the OpenAI embedding model requested for every file. */
const OPENAI_MODEL = "text-embedding-3-large";

/** Declared width of the F32_BLOB column that stores each embedding. */
const EMBEDDING_DIMENSION = 3072;
|
|
|
|
// env

// Populate process.env from a .env file before validating it.
config();

// min(1) rejects an empty OPENAI_API_KEY (e.g. `OPENAI_API_KEY=` in .env) at
// startup, instead of letting the script run until the first API call fails.
const EnvSchema = z.object({ OPENAI_API_KEY: z.string().min(1) });

// parse() throws a ZodError describing the problem when the key is missing or empty.
const { OPENAI_API_KEY } = EnvSchema.parse(process.env);
|
|
|
|
// clients

const openai = new OpenAI({ apiKey: OPENAI_API_KEY });
const db = createClient({ url: DB_PATH });

// Schema for the embeddings store: one row per file path, holding the
// checksum the embedding was computed from and the embedding vector itself.
const createTableSql = `CREATE TABLE IF NOT EXISTS file_embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file_path TEXT UNIQUE NOT NULL,
    checksum TEXT NOT NULL,
    embedding F32_BLOB(${EMBEDDING_DIMENSION}) NOT NULL,
    last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
  )`;

// Runs at module load (top-level await) so the table exists before any file
// is processed; IF NOT EXISTS keeps repeated runs harmless.
await db.execute(createTableSql);
|
|
|
|
/**
 * Computes the SHA-256 digest of a file's raw bytes.
 *
 * @param filePath - Path of the file to hash; it is read synchronously in full.
 * @returns The digest encoded as a hexadecimal string.
 */
const getChecksum = (filePath: string): string =>
  createHash("sha256").update(readFileSync(filePath)).digest("hex");
|
|
|
|
/**
 * Requests an embedding vector for the given text from the OpenAI API.
 *
 * @param content - Text to embed; sent as a single input.
 * @returns The embedding of the first (and only requested) input.
 * @throws Error when the response carries no embedding entries; errors raised
 *   by the OpenAI client itself propagate unchanged.
 */
const getEmbedding = async (content: string): Promise<number[]> => {
  const { data } = await openai.embeddings.create({
    model: OPENAI_MODEL,
    input: content,
    encoding_format: "float",
  });

  // Guard the index access so an empty response produces a clear message
  // rather than a TypeError on `undefined.embedding`.
  const first = data[0];
  if (first === undefined) {
    throw new Error("OpenAI embeddings response contained no data");
  }

  // With encoding_format "float" the SDK already types this as number[],
  // so no type assertion is needed.
  return first.embedding;
};
|
|
|
|
/**
 * Collects every file beneath a directory whose name ends with one of
 * SOURCE_FILE_EXTENSIONS.
 *
 * The tree is walked iteratively with an explicit stack of directories still
 * to visit, so deep trees do not grow the call stack.
 *
 * @param startDir - Directory at which the walk begins.
 * @returns Paths of all matching files, each joined onto `startDir`.
 */
const findFiles = (startDir: string): string[] => {
  const matches: string[] = [];
  const pending: string[] = [startDir];

  while (pending.length > 0) {
    const dir = pending.pop() as string;

    for (const name of readdirSync(dir)) {
      const candidate = path.join(dir, name);

      // Directories are queued for a later visit; they are never matched
      // against the extension list themselves.
      if (statSync(candidate).isDirectory()) {
        pending.push(candidate);
        continue;
      }

      const isSourceFile = SOURCE_FILE_EXTENSIONS.some((ext) =>
        name.endsWith(ext),
      );
      if (isSourceFile) {
        matches.push(candidate);
      }
    }
  }

  return matches;
};
|
|
|
|
/**
 * Embeds one source file and stores the result in `file_embeddings`.
 *
 * Files whose content is empty or whitespace-only are skipped, as are files
 * whose stored checksum matches the current one. A file that is new, or whose
 * checksum has changed, is embedded and written to the table.
 *
 * @param filePath - Path of the file to process; also used as the row key.
 * @returns A promise that settles once the file has been skipped or stored.
 */
const processFile = async (filePath: string): Promise<void> => {
  const content = readFileSync(filePath, "utf8");
  if (!content.trim()) {
    console.log(`Skipping ${filePath} because it is empty`);
    return;
  }

  // The checksum is taken over the raw file bytes (not the decoded string).
  const checksum = getChecksum(filePath);
  const existing = await db.execute(
    "SELECT checksum FROM file_embeddings WHERE file_path = ?",
    [filePath],
  );
  const row = existing.rows[0];
  if (row && row.checksum === checksum) {
    console.log(`Skipping ${filePath} because it already exists`);
    return;
  }

  const embedding = await getEmbedding(content);

  // file_path is UNIQUE, so a plain INSERT fails for a file that is already
  // stored with an outdated checksum. Upsert instead: insert new files, and
  // refresh checksum, embedding and timestamp for changed ones.
  // NOTE(review): the embedding is bound exactly as before (a number[]);
  // confirm the client accepts that for an F32_BLOB column — libSQL's vector
  // docs bind a JSON string through vector32(?).
  await db.execute(
    `INSERT INTO file_embeddings (file_path, checksum, embedding)
     VALUES (?, ?, ?)
     ON CONFLICT(file_path) DO UPDATE SET
       checksum = excluded.checksum,
       embedding = excluded.embedding,
       last_updated = CURRENT_TIMESTAMP`,
    [filePath, checksum, embedding],
  );
};
|
|
|
|
// Entry point: embed every source file beneath the directory given as the
// first command-line argument, defaulting to the current directory.
const targetDirectory = process.argv[2] ?? ".";

// throwIfNoEntry: false makes statSync return undefined for a missing path
// instead of throwing ENOENT, so the message below is actually reached for
// nonexistent directories as well as for paths that are not directories.
const targetStats = statSync(targetDirectory, { throwIfNoEntry: false });
if (!targetStats?.isDirectory()) {
  console.error(`Directory '${targetDirectory}' not found`);
  process.exit(1);
}

const files = findFiles(targetDirectory);

// Files are processed one at a time, in the order findFiles returned them.
for (const [index, filePath] of files.entries()) {
  console.log(`Processing ${filePath} (${index + 1}/${files.length})`);
  await processFile(filePath);
}

console.log("Done");
process.exit(0);
|