feat: support voice & image data as input to event

2024-10-08 22:19:46 +05:30 · 2024-10-08 22:19:46 +05:30 · b2ad77e5b5
parent 97add456a7
commit b2ad77e5b5
7 changed files with 192 additions and 71 deletions
--- a/interfaces/discord.ts
+++ b/interfaces/discord.ts
@ -277,7 +277,9 @@ export class DiscordAdapter implements PlatformAdapter {
      isDirectMessage: async () =>
        discordMessage.channel.type === ChannelType.DM,
      send: async (messageData) => {
-        const sentMessage = await discordMessage.channel.send(messageData);
+        const sentMessage = await (discordMessage.channel as TextChannel).send(
+          messageData
+        );
        return this.convertSentMessage(sentMessage);
      },
      reply: async (messageData) => {
@ -309,12 +311,12 @@ export class DiscordAdapter implements PlatformAdapter {
        return Promise.all(messages.map((msg) => this.convertMessage(msg)));
      },
      sendFile: async (fileUrl, fileName) => {
-        await discordMessage.channel.send({
+        await (discordMessage.channel as TextChannel).send({
          files: [{ attachment: fileUrl, name: fileName }],
        });
      },
      sendTyping: async () => {
-        await discordMessage.channel.sendTyping();
+        await (discordMessage.channel as TextChannel).sendTyping();
      },
    };

@ -366,19 +368,21 @@ export class DiscordAdapter implements PlatformAdapter {
        return Promise.all(messages.map((msg) => this.convertMessage(msg)));
      },
      sendFile: async (fileUrl, fileName) => {
-        await discordMessage.channel.send({
+        await (discordMessage.channel as TextChannel).send({
          files: [{ attachment: fileUrl, name: fileName }],
        });
      },
      sendTyping: async () => {
-        await discordMessage.channel.sendTyping();
+        await (discordMessage.channel as TextChannel).sendTyping();
      },
      reply: async (messageData) => {
        const sentMessage = await discordMessage.reply(messageData);
        return this.convertSentMessage(sentMessage);
      },
      send: async (messageData) => {
-        const sentMessage = await discordMessage.channel.send(messageData);
+        const sentMessage = await (discordMessage.channel as TextChannel).send(
+          messageData
+        );
        return this.convertSentMessage(sentMessage);
      },
    };
--- a/interfaces/events.ts
+++ b/interfaces/events.ts
@ -1,6 +1,7 @@
 import { Elysia, t } from "elysia";
 import { userConfigs } from "../config";
 import { send_sys_log } from "./log";
+import { get_transcription } from "../tools/ask";

 // Define the type for the event callback
 type EventCallback = (
@ -187,7 +188,7 @@ export const events = new Elysia()
          body = textbody;
        }
      }
-      // console.log("Event received", body);
+      console.log("Event received", body);

      if (id === "ping") {
        send_sys_log(`Ping event received: ${JSON.stringify(body)}`);
--- a/interfaces/message.ts
+++ b/interfaces/message.ts
@ -28,6 +28,8 @@ export interface Embed {
 export interface MessageData {
  content?: string;
  embeds?: Embed[];
+  options?: any;
+  flags?: any;
  file?:
    | {
        url: string;
--- a/tools/actions.ts
+++ b/tools/actions.ts
@ -270,7 +270,7 @@ async function executeAction(action: Action) {
      tools = tools?.length ? tools : undefined;

      const response = await ask({
-        model: "gpt-4o-mini",
+        model: "gpt-4o",
        prompt: `You are an Action Executor.
      
      You are called to execute an action based on the provided instruction.
--- a/tools/ask.ts
+++ b/tools/ask.ts
@ -2,6 +2,7 @@ import OpenAI from "openai";
 import { saveApiUsage } from "../usage";
 import axios from "axios";
 import fs from "fs";
+import path from "path";
 import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
 import {
  ChatCompletion,
@ -132,13 +133,17 @@ export async function ask({
  name,
  tools,
  seed,
+  json,
+  image_url,
 }: {
  model?: string;
  prompt: string;
  message?: string;
+  image_url?: string;
  name?: string;
  tools?: RunnableToolFunctionWithParse<any>[];
  seed?: string;
+  json?: boolean;
 }): Promise<ChatCompletion> {
  // Initialize OpenAI instances
  const openai = new OpenAI({
@ -171,10 +176,24 @@ export async function ask({
      ...history,
      {
        role: "user",
-        content: message,
+        content: image_url
+          ? [
+              {
+                type: "text",
+                text: message,
+              },
+              {
+                type: "image_url",
+                image_url: {
+                  url: image_url,
+                },
+              },
+            ]
+          : message,
        name,
      },
    ];
+    console.log("got image:", image_url?.slice(0, 20));
  } else if (seed && !message) {
    // If seed is provided but no new message, just retrieve history
    const history = getMessageHistory(seed);
@ -189,7 +208,20 @@ export async function ask({
    // If no seed but message is provided, send system prompt and user message without history
    messages.push({
      role: "user",
-      content: message,
+      content: image_url
+        ? [
+            {
+              type: "text",
+              text: message,
+            },
+            {
+              type: "image_url",
+              image_url: {
+                url: image_url,
+              },
+            },
+          ]
+        : message,
      name,
    });
  }
@ -228,6 +260,7 @@ export async function ask({
        model,
        messages,
        tools,
+        response_format: json ? { type: "json_object" } : undefined,
      })
      .on("functionCall", (functionCall) => {
        send_sys_log(`ASK Function call: ${JSON.stringify(functionCall)}`);
@ -293,15 +326,20 @@ export async function ask({
 const transcriptionCacheFile = pathInDataDir("transcription_cache.json");

 export async function get_transcription(
-  file_url: string,
+  input: string | File, // Accept either a file URL (string) or a File object
  binary?: boolean,
  key?: string
 ) {
+  // const openai = new OpenAI({
+  //   apiKey: ai_token,
+  // });
+
  const openai = new OpenAI({
-    apiKey: ai_token,
+    apiKey: groq_token,
+    baseURL: groq_baseurl,
  });

-  // Step 1: Check if the transcription for this file URL is already cached
+  // Step 1: Check if the transcription for this input (file_url or File) is already cached
  let transcriptionCache: Record<string, string> = {};

  // Try to read the cache file if it exists
@ -310,77 +348,91 @@ export async function get_transcription(
    transcriptionCache = JSON.parse(cacheData);
  }

-  if (binary) {
-    // If transcription for this file_url is already in the cache, return it
-    if (key && transcriptionCache[key]) {
-      console.log("Transcription found in cache:", transcriptionCache[key]);
-      return transcriptionCache[key];
+  let filePath: string;
+  let isAudio = false;
+  let fileExtension: string;
+
+  // Determine if the input is a File or URL and handle accordingly
+  if (input instanceof File) {
+    // Check the MIME type for audio validation
+    if (!input.type.startsWith("audio/")) {
+      throw new Error("The provided file is not an audio file.");
+    }
+    isAudio = true;
+
+    // Set file extension based on the MIME type
+    fileExtension = getExtensionFromMimeType(input.type) ?? "ogg";
+    if (!fileExtension) {
+      throw new Error(`Unsupported audio file type: ${input.type}`);
    }

-    const binaryData = Buffer.from(file_url, "base64");
-    // fs.writeFile("/home/audio_whats.ogg", binaryData, function (err) {});
+    // Write the file to the filesystem temporarily with the correct extension
+    filePath = `/tmp/audio${Date.now()}.${fileExtension}`;
+    const buffer = await input.arrayBuffer();
+    fs.writeFileSync(filePath, new Uint8Array(buffer));
+  } else if (typeof input === "string") {
+    if (binary) {
+      // If input is binary data
+      const binaryData = Buffer.from(input, "base64");
+      if (key && transcriptionCache[key]) {
+        console.log("Transcription found in cache:", transcriptionCache[key]);
+        return transcriptionCache[key];
+      }
+      filePath = `/tmp/audio${Date.now()}.ogg`; // Default to .ogg for binary input
+      fs.writeFileSync(filePath, new Uint8Array(binaryData));
+    } else {
+      // Treat input as a file URL and extract the file extension
+      fileExtension = path.extname(input).slice(1).toLowerCase();
+      if (!["mp3", "ogg", "wav", "m4a"].includes(fileExtension)) {
+        throw new Error(
+          "The provided URL does not point to a valid audio file."
+        );
+      }
+      isAudio = true;

-    const filePath = `/tmp/audio${Date.now()}.ogg`;
+      // Step 2: Download the file from the URL
+      const response = await axios({
+        url: input,
+        method: "GET",
+        responseType: "stream",
+      });

-    fs.writeFileSync(filePath, new Uint8Array(binaryData));
+      filePath = `/tmp/audio${Date.now()}.${fileExtension}`;

-    // Step 3: Send the file to OpenAI's Whisper model
-    const transcription = await openai.audio.transcriptions.create({
-      model: "whisper-1",
-      file: fs.createReadStream(filePath),
-    });
+      // Save the downloaded file locally
+      const writer = fs.createWriteStream(filePath);
+      response.data.pipe(writer);

-    // Delete the temp file
-    fs.unlinkSync(filePath);
-
-    // Step 4: Save the transcription to the cache
-    key && (transcriptionCache[key] = transcription.text);
-    fs.writeFileSync(
-      transcriptionCacheFile,
-      JSON.stringify(transcriptionCache, null, 2)
+      await new Promise((resolve, reject) => {
+        writer.on("finish", resolve);
+        writer.on("error", reject);
+      });
+    }
+  } else {
+    throw new Error(
+      "Invalid input type. Must be either a file URL or a File object."
    );
-
-    console.log("Transcription:", transcription);
-
-    return transcription.text;
-  }
-
-  // If transcription for this file_url is already in the cache, return it
-  if (transcriptionCache[file_url]) {
-    console.log("Transcription found in cache:", transcriptionCache[file_url]);
-    return transcriptionCache[file_url];
  }

  try {
-    // Step 2: Download the file from the URL
-    const response = await axios({
-      url: file_url,
-      method: "GET",
-      responseType: "stream",
-    });
-
-    const filePath = `/tmp/audio${Date.now()}.ogg`;
-
-    // Save the downloaded file locally
-    const writer = fs.createWriteStream(filePath);
-    response.data.pipe(writer);
-
-    await new Promise((resolve, reject) => {
-      writer.on("finish", resolve);
-      writer.on("error", reject);
-    });
-
-    // Step 3: Send the file to OpenAI's Whisper model
+    // Step 3: Send the file to OpenAI's Whisper model for transcription
    const transcription = await openai.audio.transcriptions.create({
-      model: "whisper-1",
+      // model: "whisper-1",
+      model: "distil-whisper-large-v3-en",
      file: fs.createReadStream(filePath),
+      language: "en", // Optional
+      temperature: 0.0, // Optional
    });

    // Delete the temp file
    fs.unlinkSync(filePath);

    // Step 4: Save the transcription to the cache
-    transcriptionCache[file_url] = transcription.text;
+    if (key) {
+      transcriptionCache[key] = transcription.text;
+    } else if (typeof input === "string") {
+      transcriptionCache[input] = transcription.text;
+    }
    fs.writeFileSync(
      transcriptionCacheFile,
      JSON.stringify(transcriptionCache, null, 2)
@ -390,5 +442,20 @@ export async function get_transcription(
    return transcription.text;
  } catch (error) {
    console.error("Error transcribing audio:", error);
+    throw error;
  }
 }
+
+/**
+ * Looks up the file extension to use for an audio MIME type.
+ *
+ * The lookup table below covers MPEG, Ogg, WAV (two spellings) and M4A (two
+ * spellings) audio types. Extensions are returned without a leading dot,
+ * e.g. "mp3".
+ *
+ * NOTE(review): the lookup is an exact match on the string passed in, so a
+ * type that carries parameters (such as "audio/ogg; codecs=opus") or uses
+ * different letter casing is not found in the table and yields null.
+ * Confirm whether callers normalise the value first.
+ *
+ * @param mimeType - MIME type string to look up, e.g. "audio/mpeg".
+ * @returns The mapped extension, or null when the type is not in the table.
+ */
+function getExtensionFromMimeType(mimeType: string): string | null {
+  const mimeTypesMap: Record<string, string> = {
+    "audio/mpeg": "mp3",
+    "audio/ogg": "ogg",
+    "audio/wav": "wav",
+    "audio/x-wav": "wav",
+    "audio/x-m4a": "m4a",
+    "audio/m4a": "m4a",
+    // Add other audio types as necessary
+  };
+  return mimeTypesMap[mimeType] || null;
+}
--- a/tools/communication.ts
+++ b/tools/communication.ts
@ -111,6 +111,7 @@ You can use the \`memory_manager\` tool to remember user preferences, such as wh

  const response = await ask({
    prompt,
+    model: "gpt-4o",
    message: `request: ${request}

    prefered_platform: ${prefered_platform}
--- a/tools/events.ts
+++ b/tools/events.ts
@ -9,7 +9,7 @@ import path from "path";
 import { discordAdapter } from "../interfaces";
 import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
 import { getTools, zodFunction } from ".";
-import { ask } from "./ask";
+import { ask, get_transcription } from "./ask";
 import { get_actions } from "./actions";
 import { pathInDataDir, userConfigs } from "../config";
 import { memory_manager_guide, memory_manager_init } from "./memory-manager";
@ -398,11 +398,52 @@ function registerListener(listener: EventListener) {

        const is_voice = listener.eventId === "on_voice_message";

+        let attached_image: string | undefined = undefined;
+
        if (is_voice) {
          tools = getTools(
            contextMessage.author.username,
            contextMessage
          ) as RunnableToolFunctionWithParse<any>[];
+
+          const audio = ((payload as any) ?? {}).transcription;
+          if (audio && audio instanceof File) {
+            if (audio.type.includes("audio")) {
+              console.log("Transcribing audio for voice event listener.");
+              (payload as any).transcription = await get_transcription(
+                audio as File
+              );
+            }
+          }
+
+          const otherContextData = (payload as any)?.other_context_data;
+
+          if (otherContextData instanceof File) {
+            if (otherContextData.type.includes("image")) {
+              // Read the file as a buffer
+              const buffer = await otherContextData.arrayBuffer();
+
+              // Convert the buffer to a base64 string
+              const base64Url = `data:${
+                otherContextData.type
+              };base64,${Buffer.from(buffer).toString("base64")}`;
+
+              // Create the object with base64 URL
+              const imageObject = {
+                type: "image_url",
+                image_url: {
+                  url: base64Url,
+                },
+              };
+
+              // Do something with imageObject, like sending it in a response or logging
+              attached_image = base64Url;
+            } else {
+              console.log("The provided file is not an image.");
+            }
+          } else {
+            console.log("No valid file provided in other_context_data.");
+          }
        }

        console.log("Running ASK for event listener: ", listener.description);
@ -471,12 +512,12 @@ function registerListener(listener: EventListener) {
          - Payload: ${JSON.stringify(payload, null, 2)}
          
          Follow the transcript provided in the payload.
-          Reply only in plain text without markdown or any other formatting.
+          
+          You response must be in plain text without markdown or any other formatting.
          `;

        if (system_prompts) {
          prompt = `${system_prompts.map((p) => p.content).join("\n\n")}`;
-          // console.log("Voice system Prompt: ", prompt);
        }

        const response = !is_voice
@ -486,9 +527,10 @@ function registerListener(listener: EventListener) {
              tools,
            })
          : await ask({
-              model: "gpt-4o-mini",
+              model: attached_image ? "gpt-4o" : "gpt-4o-mini",
              prompt,
              message: voice_prompt,
+              image_url: attached_image,
              seed: `voice-anya-${listener.id}-${eventId}`,
              tools,
            });
@ -503,7 +545,11 @@ function registerListener(listener: EventListener) {
        }

        // Send a message to the user indicating the event was triggered
-        if (notify) await contextMessage.send({ content });
+        if (notify)
+          await contextMessage.send({
+            content,
+            flags: is_voice ? [4096] : undefined,
+          });
        else console.log("Silenced Notification: ", content);

        // Handle auto-stop options