anya/tools/notes-vectors.ts

303 lines
8.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { createClient } from "webdav";
import {
PGVectorStore,
DistanceStrategy,
} from "@langchain/community/vectorstores/pgvector";
import { OpenAIEmbeddings } from "@langchain/openai";
import { v4 as uuidv4 } from "uuid";
import * as crypto from "crypto";
import skmeans from "skmeans";
// Run guards: each flag is set to true while the corresponding job
// (syncVectorStore / cleanupDeletedFiles) is executing, so that a call made
// while the job is still in progress returns early instead of overlapping.
let isSyncing = false;
let isCleanupRunning = false;
// Initialize WebDAV client
// The server base URL is hard-coded. The credentials are read from the
// NEXTCLOUD_USERNAME / NEXTCLOUD_PASSWORD environment variables; the `!`
// assertions only satisfy the type checker, nothing here verifies at runtime
// that the variables are actually set.
const webdavClient = createClient(
"http://192.168.29.85/remote.php/dav/files/raj/",
{
username: process.env.NEXTCLOUD_USERNAME!,
password: process.env.NEXTCLOUD_PASSWORD!,
}
);
/**
 * Computes the MD5 digest of a piece of text.
 *
 * @param content Text to hash; it is fed to the hash as UTF-8.
 * @returns The digest as a lowercase hexadecimal string.
 */
function calculateChecksum(content: string): string {
  const hasher = crypto.createHash("md5");
  hasher.update(content, "utf8");
  return hasher.digest("hex");
}
/**
 * Recursively lists a WebDAV directory and downloads the text of every note in it.
 *
 * An entry is downloaded only when it is a regular file, its basename does not
 * start with ".", its path contains neither "/.obsidian/" nor "prompts/", and
 * its path ends in ".txt" or ".md". Files are fetched one after another.
 *
 * @param path Directory to list, passed as-is to the WebDAV client.
 * @returns One `{ filename, content }` entry per downloaded file whose content
 *          came back as a string.
 */
async function getAllFiles(
  path: string
): Promise<{ filename: string; content: string }[]> {
  const listing = await webdavClient.getDirectoryContents(path, {
    deep: true,
  });
  // The client returns either the array itself or a wrapper carrying it in `.data`.
  const entries = Array.isArray(listing) ? listing : listing.data;
  const collected: { filename: string; content: string }[] = [];

  for (const entry of entries) {
    if (entry.type !== "file") continue;
    if (entry.basename.startsWith(".")) continue;

    const remotePath = entry.filename;
    if (remotePath.includes("/.obsidian/")) continue;
    if (remotePath.includes("prompts/")) continue;

    const isNote = remotePath.endsWith(".txt") || remotePath.endsWith(".md");
    if (!isNote) continue;

    const body = await webdavClient.getFileContents(remotePath, {
      format: "text",
    });
    // Anything that did not come back as text is silently left out.
    if (typeof body === "string") {
      collected.push({ filename: remotePath, content: body });
    }
  }

  return collected;
}
// Setup PGVectorStore
// Embedding model handed to the vector store.
const embeddings = new OpenAIEmbeddings({
model: "text-embedding-ada-002",
});
// Connection settings, table name and column names used by the vector store
// and by the raw SQL queries in this file.
// NOTE(review): the database host and credentials are hard-coded here —
// consider reading them from the environment like the WebDAV credentials.
const config = {
postgresConnectionOptions: {
type: "postgres",
host: "127.0.0.1",
port: 5432,
user: "postgres",
password: "defaultpwd",
database: "postgres",
},
tableName: "anya",
columns: {
idColumnName: "id",
vectorColumnName: "vector",
contentColumnName: "content",
metadataColumnName: "metadata",
// Column holding each row's cluster label; it is added to the table on
// demand by ensureClusterColumn() and written by generateClusters().
clusterColumnName: "cluster",
},
distanceStrategy: "cosine" as DistanceStrategy,
};
// Top-level await: the store is initialized while this module is being loaded.
const vectorStore = await PGVectorStore.initialize(embeddings, config);
// Number of clusters requested by runClustering().
const CLUSTER_COUNT = 4;
/**
 * Synchronizes the vector store with the files under the `notes` directory.
 *
 * Every note is prefixed with its filename, checksummed, and compared with the
 * rows already stored under that filename. Rows whose checksum differs are
 * deleted; if no row with the current checksum remains, the note is embedded
 * and stored under a fresh UUID. Clustering is re-run when at least one note
 * was indexed.
 *
 * Only one sync runs at a time: a call made while another one is in progress
 * logs a message and returns. Errors are logged and not rethrown.
 */
export async function syncVectorStore(): Promise<void> {
  if (isSyncing) {
    console.log("syncVectorStore is already running. Skipping this run.");
    return;
  }
  isSyncing = true;
  try {
    console.log("Starting vector store sync...");
    const files = await getAllFiles("notes");
    let filesIndexed = 0;

    for (const file of files) {
      const content = `filename: ${file.filename}\n${file.content}`;
      const checksum = calculateChecksum(content);

      // Only id and metadata are needed for the comparison; selecting them
      // explicitly avoids transferring the embedding vector of every note on
      // every run.
      const queryResult = await vectorStore.client?.query(
        `SELECT ${config.columns.idColumnName} AS id, ${config.columns.metadataColumnName} AS metadata FROM ${config.tableName} WHERE ${config.columns.metadataColumnName}->>'filename' = $1`,
        [file.filename]
      );
      const rows: { id: string; metadata?: { checksum?: string } | null }[] =
        queryResult?.rows ?? [];

      // Remove every outdated row for this filename, not just the first one,
      // so stale duplicates cannot linger in the table.
      const staleIds = rows
        .filter((row) => row.metadata?.checksum !== checksum)
        .map((row) => row.id);
      if (staleIds.length > 0) {
        await vectorStore.delete({ ids: staleIds });
        console.log(`Deleted old version of ${file.filename}`);
      }

      // A row with the current checksum is still present: nothing to re-embed.
      if (staleIds.length < rows.length) {
        continue;
      }

      const document = {
        pageContent: content,
        metadata: { checksum, filename: file.filename, id: uuidv4() },
      };
      // The same UUID is used as the row id and recorded in the metadata.
      await vectorStore.addDocuments([document], {
        ids: [document.metadata.id],
      });
      filesIndexed++;
      console.log(`Indexed ${file.filename}`);
    }

    // Cluster labels only need refreshing when the set of embeddings changed.
    if (filesIndexed > 0) {
      await runClustering();
    }
    console.log("Vector store sync completed.");
  } catch (error) {
    console.error("Error during vector store sync:", error);
  } finally {
    // Always release the guard so the next scheduled run can proceed.
    isSyncing = false;
  }
}
/**
 * Removes rows from the vector store whose source file no longer exists under
 * the `notes` directory, and re-runs clustering if anything was removed.
 *
 * Only one cleanup runs at a time: a call made while another one is in
 * progress logs a message and returns. Errors are logged and not rethrown.
 */
export async function cleanupDeletedFiles(): Promise<void> {
  if (isCleanupRunning) {
    console.log("cleanupDeletedFiles is already running. Skipping this run.");
    return;
  }
  isCleanupRunning = true;
  try {
    console.log("Starting cleanup of deleted files...");
    // Get the filename and id of every row in the vector store.
    const queryResult = await vectorStore.client?.query(
      `SELECT metadata->>'filename' AS filename, id FROM ${config.tableName}`
    );
    if (queryResult) {
      const dbFiles = queryResult.rows;
      const files = await getAllFiles("notes");
      // A Set gives constant-time membership checks instead of scanning the
      // whole file list once per stored row.
      const existingFilenames = new Set(files.map((file) => file.filename));
      let deletedFiles = 0;
      for (const dbFile of dbFiles) {
        if (!existingFilenames.has(dbFile.filename)) {
          // The source file is gone, so drop its row from the vector store.
          await vectorStore.delete({ ids: [dbFile.id] });
          deletedFiles++;
          console.log(
            `Deleted ${dbFile.filename} from vector store as it no longer exists.`
          );
        }
      }
      // Cluster labels only need refreshing when rows were actually removed.
      if (deletedFiles > 0) {
        await runClustering();
      }
    }
    console.log("Cleanup of deleted files completed.");
  } catch (error) {
    console.error("Error during cleanup of deleted files:", error);
  } finally {
    // Always release the guard so the next scheduled run can proceed.
    isCleanupRunning = false;
  }
}
/**
 * Adds the integer cluster column to the vector table if it is not there yet.
 * The confirmation message is logged after the statement has been awaited.
 */
async function ensureClusterColumn() {
  const table = config.tableName;
  const column = config.columns.clusterColumnName;
  const ddl = `ALTER TABLE ${table} ADD COLUMN IF NOT EXISTS ${column} INT;`;
  await vectorStore.client?.query(ddl);
  console.log("Ensured cluster column exists in the database.");
}
/**
 * Converts a raw value read from the vector column into an array of numbers.
 * Arrays are used as-is, strings are parsed as JSON, Buffers are expanded
 * byte by byte; anything else is reported and yields an empty array.
 */
function parseStoredVector(raw: unknown): number[] {
  if (Array.isArray(raw)) {
    return raw;
  }
  if (typeof raw === "string") {
    return JSON.parse(raw);
  }
  if (Buffer.isBuffer(raw)) {
    return Array.from(raw);
  }
  console.error("Unknown vector format:", raw);
  return [];
}

/**
 * Clusters the stored embeddings with k-means and writes each row's cluster
 * label to the cluster column.
 *
 * Rows whose vector cannot be decoded are left out of the clustering run and
 * their cluster column is not touched. If there is nothing to cluster the
 * function logs a message and returns.
 *
 * @param k Requested number of clusters. It is capped at the number of usable
 *          vectors; values below 1 result in no clustering.
 */
async function generateClusters(k: number): Promise<void> {
  // Ensure the cluster column exists before proceeding
  await ensureClusterColumn();
  const queryResult = await vectorStore.client?.query(
    `SELECT ${config.columns.idColumnName} AS id, ${config.columns.vectorColumnName} AS vector FROM ${config.tableName}`
  );
  if (!queryResult) {
    console.log("No embeddings found in the vector store.");
    return;
  }

  const rows: { id: string; vector: unknown }[] = queryResult.rows;
  // Decode every vector and drop the ones that came out empty: mixing
  // zero-length vectors with real ones would corrupt the distance computation.
  const points = rows
    .map((row) => ({ id: row.id, vector: parseStoredVector(row.vector) }))
    .filter((point) => point.vector.length > 0);

  // NOTE(review): skmeans appears to pick k distinct data points as initial
  // centroids, so k must not exceed the number of points and the data set must
  // not be empty — confirm against the skmeans documentation.
  const clusterCount = Math.min(Math.floor(k), points.length);
  if (clusterCount < 1) {
    console.log("No embeddings found in the vector store.");
    return;
  }

  // Run clustering algorithm (K-means)
  const result = skmeans(
    points.map((point) => point.vector),
    clusterCount
  );

  // Save each document's cluster label in the database
  for (const [index, point] of points.entries()) {
    const cluster = result.idxs[index];
    await vectorStore.client?.query(
      `UPDATE ${config.tableName} SET ${config.columns.clusterColumnName} = $1 WHERE ${config.columns.idColumnName} = $2`,
      [cluster, point.id]
    );
    console.log(`Document ID: ${point.id} assigned to Cluster: ${cluster}`);
  }
  console.log("Cluster assignments saved to database.");
}
/**
 * Public entry point for clustering: logs a start message and regenerates the
 * cluster assignments using the module-wide CLUSTER_COUNT.
 */
export async function runClustering() {
  console.log("Generating clusters...");
  await generateClusters(CLUSTER_COUNT);
}
/**
 * Runs an initial sync followed by an initial cleanup, and schedules both jobs
 * to repeat: the sync every 2 minutes, the cleanup every 2 hours.
 *
 * Both jobs catch and log their own errors, so the timers never receive a
 * rejected promise from them.
 */
export async function initVectorStoreSync(): Promise<void> {
  const syncIntervalMs = 1000 * 60 * 2; // 2 minutes
  const cleanupIntervalMs = 1000 * 60 * 60 * 2; // 2 hours

  console.log("Starting vector store sync...");
  await syncVectorStore();
  setInterval(syncVectorStore, syncIntervalMs);
  await cleanupDeletedFiles();
  setInterval(cleanupDeletedFiles, cleanupIntervalMs);
}
/**
 * Runs a similarity search over the indexed notes.
 *
 * @param query Text to search for.
 * @param limit Forwarded to the vector store as the second argument of
 *              `similaritySearch`.
 * @returns Whatever `vectorStore.similaritySearch` returns for these arguments.
 */
export function semantic_search_notes(query: string, limit: number) {
  const matches = vectorStore.similaritySearch(query, limit);
  return matches;
}
/**
 * Reads every row's filename and cluster label and groups the filenames by
 * cluster.
 *
 * @returns A map from `"Cluster <label>"` to the filenames in that cluster.
 *          The map is empty when the query produced no result object.
 */
export async function getClusteredFiles(): Promise<Record<string, string[]>> {
  const { metadataColumnName, clusterColumnName } = config.columns;
  const sql = [
    `SELECT ${metadataColumnName}->>'filename' AS filename, ${clusterColumnName} AS cluster`,
    `FROM ${config.tableName}`,
  ].join("\n");
  const rowsResult = await vectorStore.client?.query(sql);

  const grouped: Record<string, string[]> = {};
  if (!rowsResult) {
    console.log("No clustered files found in the vector store.");
    return grouped;
  }

  for (const row of rowsResult.rows) {
    const label = `Cluster ${row.cluster}`;
    const bucket = grouped[label];
    if (bucket) {
      bucket.push(row.filename);
    } else {
      // First filename seen for this cluster: start its list.
      grouped[label] = [row.filename];
    }
  }

  console.log("Clustered files:", grouped);
  return grouped;
}