303 lines
8.9 KiB
TypeScript
303 lines
8.9 KiB
TypeScript
import { createClient } from "webdav";
|
||
import {
|
||
PGVectorStore,
|
||
DistanceStrategy,
|
||
} from "@langchain/community/vectorstores/pgvector";
|
||
import { OpenAIEmbeddings } from "@langchain/openai";
|
||
import { v4 as uuidv4 } from "uuid";
|
||
import * as crypto from "crypto";
|
||
import skmeans from "skmeans";
|
||
|
||
let isSyncing = false;
|
||
let isCleanupRunning = false;
|
||
|
||
// Initialize WebDAV client
|
||
const webdavClient = createClient(
|
||
"http://192.168.29.85/remote.php/dav/files/raj/",
|
||
{
|
||
username: process.env.NEXTCLOUD_USERNAME!,
|
||
password: process.env.NEXTCLOUD_PASSWORD!,
|
||
}
|
||
);
|
||
|
||
// Helper function to calculate checksum of content
|
||
function calculateChecksum(content: string): string {
|
||
return crypto.createHash("md5").update(content, "utf8").digest("hex");
|
||
}
|
||
|
||
// Function to get all files from 'notes' directory via WebDAV
|
||
async function getAllFiles(
|
||
path: string
|
||
): Promise<{ filename: string; content: string }[]> {
|
||
const contents = await webdavClient.getDirectoryContents(path, {
|
||
deep: true,
|
||
});
|
||
|
||
const files = Array.isArray(contents) ? contents : contents.data;
|
||
|
||
const fileContents: { filename: string; content: string }[] = [];
|
||
|
||
for (const file of files) {
|
||
if (
|
||
file.type === "file" &&
|
||
!file.basename.startsWith(".") &&
|
||
!file.filename.includes("/.obsidian/") &&
|
||
!file.filename.includes("prompts/") &&
|
||
(file.filename.endsWith(".txt") || file.filename.endsWith(".md"))
|
||
) {
|
||
const content = await webdavClient.getFileContents(file.filename, {
|
||
format: "text",
|
||
});
|
||
if (typeof content === "string") {
|
||
fileContents.push({ filename: file.filename, content });
|
||
}
|
||
}
|
||
}
|
||
|
||
return fileContents;
|
||
}
|
||
|
||
// Setup PGVectorStore
|
||
const embeddings = new OpenAIEmbeddings({
|
||
model: "text-embedding-ada-002",
|
||
});
|
||
|
||
const config = {
|
||
postgresConnectionOptions: {
|
||
type: "postgres",
|
||
host: "127.0.0.1",
|
||
port: 5432,
|
||
user: "postgres",
|
||
password: "defaultpwd",
|
||
database: "postgres",
|
||
},
|
||
tableName: "anya",
|
||
columns: {
|
||
idColumnName: "id",
|
||
vectorColumnName: "vector",
|
||
contentColumnName: "content",
|
||
metadataColumnName: "metadata",
|
||
clusterColumnName: "cluster",
|
||
},
|
||
distanceStrategy: "cosine" as DistanceStrategy,
|
||
};
|
||
|
||
const vectorStore = await PGVectorStore.initialize(embeddings, config);
|
||
|
||
const CLUSTER_COUNT = 4;
|
||
|
||
// Main function to sync vector store
|
||
export async function syncVectorStore() {
|
||
if (isSyncing) {
|
||
console.log("syncVectorStore is already running. Skipping this run.");
|
||
return;
|
||
}
|
||
|
||
isSyncing = true;
|
||
try {
|
||
console.log("Starting vector store sync...");
|
||
const files = await getAllFiles("notes");
|
||
|
||
let filesIndexed = 0;
|
||
|
||
for (const file of files) {
|
||
const content = `filename: ${file.filename}\n${file.content}`;
|
||
// Calculate checksum
|
||
const checksum = calculateChecksum(content);
|
||
|
||
// Check if the document already exists using direct SQL query
|
||
const queryResult = await vectorStore.client?.query(
|
||
`SELECT * FROM ${config.tableName} WHERE metadata->>'filename' = $1`,
|
||
[file.filename]
|
||
);
|
||
|
||
if (queryResult && queryResult.rows.length > 0) {
|
||
const existingDocument = queryResult.rows[0];
|
||
const existingChecksum = existingDocument.metadata?.checksum;
|
||
|
||
// If the checksum matches, skip updating
|
||
if (existingChecksum === checksum) {
|
||
continue;
|
||
}
|
||
|
||
// If the content is different, delete the old version
|
||
await vectorStore.delete({ ids: [existingDocument.id] });
|
||
console.log(`Deleted old version of ${file.filename}`);
|
||
}
|
||
|
||
// Load the document
|
||
const document = {
|
||
pageContent: content,
|
||
metadata: { checksum, filename: file.filename, id: uuidv4() },
|
||
};
|
||
|
||
// Add or update the document in the vector store
|
||
await vectorStore.addDocuments([document], {
|
||
ids: [document.metadata.id],
|
||
});
|
||
filesIndexed++;
|
||
console.log(`Indexed ${file.filename}`);
|
||
}
|
||
filesIndexed > 0 && (await runClustering());
|
||
console.log("Vector store sync completed.");
|
||
} catch (error) {
|
||
console.error("Error during vector store sync:", error);
|
||
} finally {
|
||
isSyncing = false;
|
||
}
|
||
}
|
||
|
||
// Function to remove deleted files from vector store
|
||
export async function cleanupDeletedFiles() {
|
||
if (isCleanupRunning) {
|
||
console.log("cleanupDeletedFiles is already running. Skipping this run.");
|
||
return;
|
||
}
|
||
|
||
isCleanupRunning = true;
|
||
try {
|
||
console.log("Starting cleanup of deleted files...");
|
||
|
||
// Get the list of all files in the vector store
|
||
const queryResult = await vectorStore.client?.query(
|
||
`SELECT metadata->>'filename' AS filename, id FROM ${config.tableName}`
|
||
);
|
||
|
||
if (queryResult) {
|
||
const dbFiles = queryResult.rows;
|
||
const files = await getAllFiles("notes");
|
||
const existingFilenames = files.map((file) => file.filename);
|
||
let deletedFiles = 0;
|
||
|
||
for (const dbFile of dbFiles) {
|
||
if (!existingFilenames.includes(dbFile.filename)) {
|
||
// Delete the file from the vector store if it no longer exists in notes
|
||
await vectorStore.delete({ ids: [dbFile.id] });
|
||
deletedFiles++;
|
||
console.log(
|
||
`Deleted ${dbFile.filename} from vector store as it no longer exists.`
|
||
);
|
||
}
|
||
}
|
||
deletedFiles > 0 && (await runClustering());
|
||
}
|
||
|
||
console.log("Cleanup of deleted files completed.");
|
||
} catch (error) {
|
||
console.error("Error during cleanup of deleted files:", error);
|
||
} finally {
|
||
isCleanupRunning = false;
|
||
}
|
||
}
|
||
|
||
// Ensure the cluster column exists in the table
|
||
async function ensureClusterColumn() {
|
||
await vectorStore.client?.query(
|
||
`ALTER TABLE ${config.tableName} ADD COLUMN IF NOT EXISTS ${config.columns.clusterColumnName} INT;`
|
||
);
|
||
console.log("Ensured cluster column exists in the database.");
|
||
}
|
||
|
||
// Function to generate clusters from stored embeddings and save them to the database
|
||
async function generateClusters(k: number) {
|
||
// Ensure the cluster column exists before proceeding
|
||
await ensureClusterColumn();
|
||
|
||
const queryResult = await vectorStore.client?.query(
|
||
`SELECT ${config.columns.idColumnName} as id, ${config.columns.vectorColumnName} as vector
|
||
FROM ${config.tableName}`
|
||
);
|
||
|
||
if (!queryResult) {
|
||
console.log("No embeddings found in the vector store.");
|
||
return;
|
||
}
|
||
|
||
// Process embeddings and format data
|
||
const embeddings = queryResult.rows.map((row) => {
|
||
let vector: number[] = [];
|
||
|
||
// Check vector data format and convert to number array if needed
|
||
if (Array.isArray(row.vector)) {
|
||
vector = row.vector;
|
||
} else if (typeof row.vector === "string") {
|
||
vector = JSON.parse(row.vector);
|
||
} else if (Buffer.isBuffer(row.vector)) {
|
||
vector = Array.from(row.vector);
|
||
} else {
|
||
console.error("Unknown vector format:", row.vector);
|
||
}
|
||
|
||
return {
|
||
id: row.id,
|
||
vector,
|
||
};
|
||
});
|
||
|
||
// Extract vectors for clustering
|
||
const vectors = embeddings.map((doc) => doc.vector);
|
||
|
||
// Run clustering algorithm (K-means)
|
||
const result = skmeans(vectors, k);
|
||
|
||
// Save each document’s cluster label in the database
|
||
for (const [index, doc] of embeddings.entries()) {
|
||
const cluster = result.idxs[index];
|
||
await vectorStore.client?.query(
|
||
`UPDATE ${config.tableName} SET ${config.columns.clusterColumnName} = $1 WHERE ${config.columns.idColumnName} = $2`,
|
||
[cluster, doc.id]
|
||
);
|
||
console.log(`Document ID: ${doc.id} assigned to Cluster: ${cluster}`);
|
||
}
|
||
|
||
console.log("Cluster assignments saved to database.");
|
||
}
|
||
|
||
// Exported function to run clustering
|
||
export async function runClustering() {
|
||
const k = CLUSTER_COUNT;
|
||
console.log("Generating clusters...");
|
||
await generateClusters(k);
|
||
}
|
||
|
||
export async function initVectorStoreSync() {
|
||
console.log("Starting vector store sync...");
|
||
await syncVectorStore();
|
||
setInterval(syncVectorStore, 1000 * 60 * 2); // Every 2 minutes
|
||
await cleanupDeletedFiles();
|
||
setInterval(cleanupDeletedFiles, 1000 * 60 * 60 * 2); // Every 12 hours
|
||
}
|
||
|
||
export function semantic_search_notes(query: string, limit: number) {
|
||
return vectorStore.similaritySearch(query, limit);
|
||
}
|
||
|
||
export async function getClusteredFiles(): Promise<Record<string, string[]>> {
|
||
const result: Record<string, string[]> = {};
|
||
|
||
// Query to get filenames and their respective cluster assignments
|
||
const queryResult = await vectorStore.client?.query(
|
||
`SELECT ${config.columns.metadataColumnName}->>'filename' AS filename, ${config.columns.clusterColumnName} AS cluster
|
||
FROM ${config.tableName}`
|
||
);
|
||
|
||
if (!queryResult) {
|
||
console.log("No clustered files found in the vector store.");
|
||
return result;
|
||
}
|
||
|
||
// Group filenames by cluster
|
||
queryResult.rows.forEach((row) => {
|
||
const clusterName = `Cluster ${row.cluster}`; // Format the cluster name
|
||
const filename = row.filename;
|
||
|
||
if (!result[clusterName]) {
|
||
result[clusterName] = [];
|
||
}
|
||
result[clusterName].push(filename);
|
||
});
|
||
|
||
console.log("Clustered files:", result);
|
||
return result;
|
||
}
|