// anya/tools/notes-vectors.ts

import { createClient } from "webdav";
import {
  PGVectorStore,
  DistanceStrategy,
} from "@langchain/community/vectorstores/pgvector";
import { OpenAIEmbeddings } from "@langchain/openai";
import { v4 as uuidv4 } from "uuid";
import * as crypto from "crypto";
import skmeans from "skmeans";

// Re-entrancy guards. Each flag is set to true while the corresponding job
// (syncVectorStore / cleanupDeletedFiles) is in progress, so that an
// overlapping invocation returns immediately instead of running concurrently.
let isSyncing = false;
let isCleanupRunning = false;

// WebDAV client used to list and download note files.
// The base URL is hard-coded; the credentials are read from the
// NEXTCLOUD_USERNAME / NEXTCLOUD_PASSWORD environment variables.
// NOTE(review): the `!` assertions only silence the type checker — a missing
// variable is passed through as `undefined` and is not detected here.
const webdavClient = createClient(
  "http://192.168.29.85/remote.php/dav/files/raj/",
  {
    username: process.env.NEXTCLOUD_USERNAME!,
    password: process.env.NEXTCLOUD_PASSWORD!,
  }
);

/**
 * Computes the MD5 digest of `content`, read as UTF-8, and returns it as a
 * hexadecimal string. In this file the digest is stored next to each indexed
 * note and compared on later syncs to detect changed content.
 *
 * @param content Text to hash.
 * @returns The 32-character hex MD5 digest.
 */
function calculateChecksum(content: string): string {
  const hasher = crypto.createHash("md5");
  hasher.update(content, "utf8");
  return hasher.digest("hex");
}

/**
 * Lists `path` recursively over WebDAV and downloads the text of every note
 * file found.
 *
 * An entry is downloaded only when all of the following hold:
 *  - it is a file (not a directory),
 *  - its basename does not start with ".",
 *  - its path contains neither "/.obsidian/" nor "prompts/",
 *  - its path ends in ".txt" or ".md".
 *
 * Files are fetched one at a time, in listing order.
 *
 * @param path Directory handed to the WebDAV client's directory listing.
 * @returns One `{ filename, content }` pair per downloaded file; entries whose
 *   content does not come back as a string are left out.
 */
async function getAllFiles(
  path: string
): Promise<{ filename: string; content: string }[]> {
  const listing = await webdavClient.getDirectoryContents(path, { deep: true });

  // The listing is either a bare array or an object carrying it under `data`.
  const entries = Array.isArray(listing) ? listing : listing.data;

  const noteExtensions = [".txt", ".md"];
  const downloaded: { filename: string; content: string }[] = [];

  for (const entry of entries) {
    if (entry.type !== "file") continue;
    if (entry.basename.startsWith(".")) continue;
    if (entry.filename.includes("/.obsidian/")) continue;
    if (entry.filename.includes("prompts/")) continue;
    if (!noteExtensions.some((ext) => entry.filename.endsWith(ext))) continue;

    const text = await webdavClient.getFileContents(entry.filename, {
      format: "text",
    });
    if (typeof text === "string") {
      downloaded.push({ filename: entry.filename, content: text });
    }
  }

  return downloaded;
}

// Embedding model handed to PGVectorStore.initialize below; it is configured
// to use OpenAI's "text-embedding-ada-002".
const embeddings = new OpenAIEmbeddings({
  model: "text-embedding-ada-002",
});

// Connection, table and column settings for the vector store. The same
// object is also used by this file's hand-written SQL to build table and
// column names.
const config = {
  // NOTE(review): connection details, including the password, are hard-coded
  // here rather than read from the environment — confirm this is intended.
  postgresConnectionOptions: {
    type: "postgres",
    host: "127.0.0.1",
    port: 5432,
    user: "postgres",
    password: "defaultpwd",
    database: "postgres",
  },
  tableName: "anya",
  columns: {
    idColumnName: "id",
    vectorColumnName: "vector",
    contentColumnName: "content",
    metadataColumnName: "metadata",
    // Read by this file's own clustering SQL (ensureClusterColumn,
    // generateClusters, getClusteredFiles); presumably not a column
    // PGVectorStore itself knows about — TODO confirm.
    clusterColumnName: "cluster",
  },
  distanceStrategy: "cosine" as DistanceStrategy,
};

// Shared vector store instance. Created with a top-level await, so importing
// this module waits for PGVectorStore.initialize to finish.
const vectorStore = await PGVectorStore.initialize(embeddings, config);

// Number of clusters requested by runClustering.
const CLUSTER_COUNT = 4;

/**
 * Brings the vector store up to date with the files under "notes".
 *
 * For every note returned by getAllFiles, the text "filename: <path>\n<body>"
 * is hashed and compared with the checksum stored in the metadata of the rows
 * that carry the same filename:
 *  - rows whose checksum differs are deleted (all of them, not only the first);
 *  - if a row with the current checksum remains, the note is left alone;
 *  - otherwise the note is embedded and inserted under a fresh UUID.
 *
 * Clustering is re-run once at the end when at least one note was indexed.
 *
 * Only one run is active at a time: a call made while another run is in
 * progress logs a message and returns. Errors are logged, not rethrown.
 */
export async function syncVectorStore() {
  if (isSyncing) {
    console.log("syncVectorStore is already running. Skipping this run.");
    return;
  }

  isSyncing = true;
  try {
    console.log("Starting vector store sync...");
    const files = await getAllFiles("notes");
    const { idColumnName, metadataColumnName } = config.columns;

    let filesIndexed = 0;

    for (const file of files) {
      // The filename is part of the indexed text, and therefore of the checksum.
      const content = `filename: ${file.filename}\n${file.content}`;
      const checksum = calculateChecksum(content);

      // Fetch only the id and metadata; the embedding vector is not needed to
      // decide whether the note changed.
      const queryResult = await vectorStore.client?.query(
        `SELECT ${idColumnName} AS id, ${metadataColumnName} AS metadata
         FROM ${config.tableName}
         WHERE ${metadataColumnName}->>'filename' = $1`,
        [file.filename]
      );
      const existingRows = queryResult?.rows ?? [];

      // Split the stored rows into "still current" and "stale".
      const staleIds: string[] = [];
      let hasCurrentVersion = false;
      for (const row of existingRows) {
        if (row.metadata?.checksum === checksum) {
          hasCurrentVersion = true;
        } else {
          staleIds.push(row.id);
        }
      }

      if (staleIds.length > 0) {
        await vectorStore.delete({ ids: staleIds });
        console.log(`Deleted old version of ${file.filename}`);
      }

      // An up-to-date copy is already stored; nothing to embed.
      if (hasCurrentVersion) {
        continue;
      }

      const document = {
        pageContent: content,
        metadata: { checksum, filename: file.filename, id: uuidv4() },
      };

      // The same UUID is used as the row id and recorded in the metadata.
      await vectorStore.addDocuments([document], {
        ids: [document.metadata.id],
      });
      filesIndexed++;
      console.log(`Indexed ${file.filename}`);
    }

    if (filesIndexed > 0) {
      await runClustering();
    }
    console.log("Vector store sync completed.");
  } catch (error) {
    console.error("Error during vector store sync:", error);
  } finally {
    isSyncing = false;
  }
}

/**
 * Removes rows from the vector store whose source note no longer exists.
 *
 * Reads every (filename, id) pair from the table, lists the current notes via
 * getAllFiles("notes"), and deletes each row whose filename is not in that
 * listing. Clustering is re-run once when at least one row was deleted.
 *
 * Only one run is active at a time: a call made while another run is in
 * progress logs a message and returns. Errors are logged, not rethrown.
 */
export async function cleanupDeletedFiles() {
  if (isCleanupRunning) {
    console.log("cleanupDeletedFiles is already running. Skipping this run.");
    return;
  }

  isCleanupRunning = true;
  try {
    console.log("Starting cleanup of deleted files...");

    const { idColumnName, metadataColumnName } = config.columns;
    const queryResult = await vectorStore.client?.query(
      `SELECT ${metadataColumnName}->>'filename' AS filename, ${idColumnName} AS id FROM ${config.tableName}`
    );

    // With no stored rows there is nothing to delete, so the (expensive)
    // listing and download of all notes is skipped.
    if (queryResult && queryResult.rows.length > 0) {
      const files = await getAllFiles("notes");

      // Set lookup keeps the membership test constant-time per stored row.
      const existingFilenames = new Set<string>();
      for (const file of files) {
        existingFilenames.add(file.filename);
      }

      let deletedFiles = 0;
      for (const dbFile of queryResult.rows) {
        if (existingFilenames.has(dbFile.filename)) {
          continue;
        }

        await vectorStore.delete({ ids: [dbFile.id] });
        deletedFiles++;
        console.log(
          `Deleted ${dbFile.filename} from vector store as it no longer exists.`
        );
      }

      if (deletedFiles > 0) {
        await runClustering();
      }
    }

    console.log("Cleanup of deleted files completed.");
  } catch (error) {
    console.error("Error during cleanup of deleted files:", error);
  } finally {
    isCleanupRunning = false;
  }
}

/**
 * Adds the cluster column (type INT) to the vector table if it is not there
 * yet. Table and column names are taken from `config`.
 */
async function ensureClusterColumn() {
  const { tableName, columns } = config;
  const ddl = `ALTER TABLE ${tableName} ADD COLUMN IF NOT EXISTS ${columns.clusterColumnName} INT;`;

  await vectorStore.client?.query(ddl);
  console.log("Ensured cluster column exists in the database.");
}

/**
 * Runs k-means over every stored embedding and writes each row's cluster
 * index into the cluster column.
 *
 * Rows whose vector cannot be read (unknown format, or an empty vector) are
 * skipped and keep whatever cluster value they already had. When there are
 * fewer usable rows than `k`, the number of clusters is reduced to the number
 * of rows. Nothing is written when there are no usable rows.
 *
 * @param k Requested number of clusters.
 */
async function generateClusters(k: number) {
  // The UPDATE below needs the cluster column to exist.
  await ensureClusterColumn();

  const { idColumnName, vectorColumnName, clusterColumnName } = config.columns;

  const queryResult = await vectorStore.client?.query(
    `SELECT ${idColumnName} as id, ${vectorColumnName} as vector
     FROM ${config.tableName}`
  );

  if (!queryResult || queryResult.rows.length === 0) {
    console.log("No embeddings found in the vector store.");
    return;
  }

  // Normalise each stored vector to number[]; drop rows that cannot be used.
  const docs: { id: string; vector: number[] }[] = [];
  for (const row of queryResult.rows) {
    let vector: number[] = [];

    if (Array.isArray(row.vector)) {
      vector = row.vector;
    } else if (typeof row.vector === "string") {
      // Text form such as "[0.1,0.2,...]".
      const parsed: unknown = JSON.parse(row.vector);
      if (Array.isArray(parsed)) {
        vector = parsed;
      } else {
        console.error("Unknown vector format:", row.vector);
      }
    } else if (Buffer.isBuffer(row.vector)) {
      // NOTE(review): this yields one number per byte — confirm that is the
      // intended decoding if the driver ever returns a Buffer here.
      vector = Array.from(row.vector);
    } else {
      console.error("Unknown vector format:", row.vector);
    }

    // An empty vector cannot be clustered alongside real embeddings.
    if (vector.length === 0) {
      continue;
    }
    docs.push({ id: row.id, vector });
  }

  if (docs.length === 0) {
    console.log("No embeddings found in the vector store.");
    return;
  }

  // Asking for more clusters than there are points is not meaningful, and the
  // library's default seeding is presumably unable to pick k distinct points
  // in that case — TODO confirm against the skmeans source.
  const clusterCount = Math.min(k, docs.length);
  const result = skmeans(
    docs.map((doc) => doc.vector),
    clusterCount
  );

  // result.idxs[i] is the cluster index assigned to the i-th input vector.
  for (const [index, doc] of docs.entries()) {
    const cluster = result.idxs[index];
    await vectorStore.client?.query(
      `UPDATE ${config.tableName} SET ${clusterColumnName} = $1 WHERE ${idColumnName} = $2`,
      [cluster, doc.id]
    );
    console.log(`Document ID: ${doc.id} assigned to Cluster: ${cluster}`);
  }

  console.log("Cluster assignments saved to database.");
}

/**
 * Recomputes cluster assignments for all stored documents, using
 * CLUSTER_COUNT as the number of clusters.
 */
export async function runClustering() {
  console.log("Generating clusters...");
  await generateClusters(CLUSTER_COUNT);
}

/**
 * Runs one sync and one cleanup immediately, and schedules both to repeat.
 *
 * The sync repeats every 2 minutes and the cleanup every 2 hours. The sync
 * timer is started before the initial cleanup runs.
 */
export async function initVectorStoreSync() {
  const syncEveryMs = 2 * 60 * 1000; // 2 minutes
  // NOTE(review): an earlier comment described this as "every 12 hours", but
  // the value has always been 2 hours — confirm which cadence is intended.
  const cleanupEveryMs = 2 * 60 * 60 * 1000; // 2 hours

  console.log("Starting vector store sync...");

  await syncVectorStore();
  setInterval(syncVectorStore, syncEveryMs);

  await cleanupDeletedFiles();
  setInterval(cleanupDeletedFiles, cleanupEveryMs);
}

/**
 * Runs a similarity search for `query` against the notes vector store.
 *
 * @param query Text to search for.
 * @param limit Passed as the second argument to `similaritySearch`
 *   (presumably the maximum number of results — confirm against the library).
 * @returns Whatever `vectorStore.similaritySearch` returns, unmodified.
 */
export function semantic_search_notes(query: string, limit: number) {
  return vectorStore.similaritySearch(query, limit);
}

/**
 * Returns the stored filenames grouped by their cluster assignment.
 *
 * Keys have the form "Cluster <value>", where <value> is the content of the
 * cluster column for that row; each key maps to the filenames in that cluster.
 * The grouping is also written to the console.
 *
 * @returns The grouping, or an empty object when the query yields no result
 *   object.
 */
export async function getClusteredFiles(): Promise<Record<string, string[]>> {
  const { metadataColumnName, clusterColumnName } = config.columns;
  const grouped: Record<string, string[]> = {};

  const queryResult = await vectorStore.client?.query(
    `SELECT ${metadataColumnName}->>'filename' AS filename, ${clusterColumnName} AS cluster
     FROM ${config.tableName}`
  );

  if (!queryResult) {
    console.log("No clustered files found in the vector store.");
    return grouped;
  }

  for (const { cluster, filename } of queryResult.rows) {
    const label = `Cluster ${cluster}`;
    // Create the bucket the first time a cluster label is seen.
    const bucket = grouped[label] ?? (grouped[label] = []);
    bucket.push(filename);
  }

  console.log("Clustered files:", grouped);
  return grouped;
}