2024-11-02 12:48:38 +05:30
|
|
|
import { Client } from "pg";
|
|
|
|
import skmeans from "skmeans";
|
2024-10-31 12:42:08 +05:30
|
|
|
|
2024-11-02 12:48:38 +05:30
|
|
|
/**
 * Connection settings and table layout used by this script.
 *
 * - `postgresConnectionOptions` is passed to the `pg` `Client` constructor.
 * - `tableName` and `columns` are interpolated into the SELECT statement as
 *   SQL identifiers, so they must only ever hold trusted, developer-supplied
 *   names.
 *
 * NOTE(review): the password is a hard-coded literal; consider sourcing it
 * from the environment before using this outside local development.
 */
const config = {
  postgresConnectionOptions: {
    host: "127.0.0.1",
    port: 5432,
    user: "postgres",
    password: "defaultpwd",
    database: "postgres",
  },
  tableName: "anya",
  columns: {
    idColumnName: "id",
    vectorColumnName: "vector",
  },
};
|
|
|
|
|
|
|
|
/**
 * Converts a raw column value into a numeric vector, logging its observed
 * type and contents so the storage format can be inspected.
 *
 * @param raw - The value of the vector column as returned by the driver.
 * @param index - Position of the row in the result set (used in log output).
 * @returns The decoded vector, or an empty array when the format is unknown.
 * @throws SyntaxError if `raw` is a string that is not valid JSON.
 */
function parseVector(raw: unknown, index: number): number[] {
  console.log(`Row ${index} - Vector Type:`, typeof raw);
  console.log(`Row ${index} - Vector Data:`, raw);

  if (Array.isArray(raw)) {
    // Already an array: use as-is.
    return raw;
  }
  if (typeof raw === "string") {
    // Textual representation: parse as JSON.
    // NOTE(review): the parsed value is not checked to be an array of numbers.
    return JSON.parse(raw);
  }
  if (Buffer.isBuffer(raw)) {
    // NOTE(review): this yields one number per byte, not decoded floats —
    // confirm that is the intended interpretation of a binary column.
    return Array.from(raw);
  }

  console.error("Unknown vector format:", raw);
  return [];
}

/**
 * Fetch embeddings from PostgreSQL with data inspection.
 *
 * Connects using `config.postgresConnectionOptions`, reads the id and vector
 * columns of the first 5 rows of `config.tableName`, and normalises each
 * vector to a `number[]`.
 *
 * @returns One `{ id, vector }` entry per fetched row.
 * @throws Any error raised while connecting, querying, or parsing a vector.
 */
async function fetchEmbeddings(): Promise<{ id: string; vector: number[] }[]> {
  const client = new Client(config.postgresConnectionOptions);
  await client.connect();

  try {
    const res = await client.query(
      `SELECT ${config.columns.idColumnName} as id, ${config.columns.vectorColumnName} as vector
       FROM ${config.tableName} LIMIT 5`
    );

    return res.rows.map((row: { id: string; vector: unknown }, index: number) => ({
      id: row.id,
      vector: parseVector(row.vector, index),
    }));
  } finally {
    // Always release the connection, even when the query or parsing fails.
    await client.end();
  }
}
|
|
|
|
|
|
|
|
/**
 * Run clustering on fetched embeddings.
 *
 * Fetches the embeddings, discards any whose vector is not a non-empty array
 * of numbers (each one is reported on stderr), runs K-means on the rest, and
 * logs the cluster index assigned to every clustered document.
 *
 * @returns A promise that resolves once the assignments have been logged.
 * @throws Any error raised by `fetchEmbeddings` or by the clustering call.
 */
async function listClusters(): Promise<void> {
  const embeddings = await fetchEmbeddings();

  // Validate the format and contents of the vectors, keeping only usable ones
  // so that malformed rows are never fed to the clustering step.
  const usable = embeddings.filter((doc, index) => {
    const vector = doc.vector;
    const invalid =
      !Array.isArray(vector) ||
      vector.length === 0 ||
      vector.some((value) => isNaN(value));
    if (invalid) {
      console.error(`Invalid vector at index ${index}:`, vector);
    }
    return !invalid;
  });

  if (usable.length === 0) {
    console.error("No valid vectors to cluster.");
    return;
  }

  // Run K-means clustering with a specified number of clusters. There cannot
  // be more clusters than points, so the requested count is capped.
  const requestedClusters = 3;
  const k = Math.min(requestedClusters, usable.length);
  const result = skmeans(
    usable.map((doc) => doc.vector),
    k
  );

  // Log the cluster assignment for each document; `result.idxs` is indexed in
  // the same order as the vectors passed in above.
  usable.forEach((doc, index) => {
    console.log(`Document ID: ${doc.id}, Cluster: ${result.idxs[index]}`);
  });

  console.log("Cluster assignments:", result.idxs);
}
|
|
|
|
|
|
|
|
// Execute clustering function. On failure, log the error and mark the process
// as failed so callers (shells, CI) do not see a success exit status.
listClusters().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|