From b2ad77e5b5408523c960385305aa59873c0a6e7b Mon Sep 17 00:00:00 2001 From: Raj Sharma Date: Tue, 8 Oct 2024 22:19:46 +0530 Subject: [PATCH] feat: support voice & image data as input to event --- interfaces/discord.ts | 16 ++-- interfaces/events.ts | 3 +- interfaces/message.ts | 2 + tools/actions.ts | 2 +- tools/ask.ts | 183 ++++++++++++++++++++++++++++------------- tools/communication.ts | 1 + tools/events.ts | 56 +++++++++++-- 7 files changed, 192 insertions(+), 71 deletions(-) diff --git a/interfaces/discord.ts b/interfaces/discord.ts index 474d02a..9849384 100644 --- a/interfaces/discord.ts +++ b/interfaces/discord.ts @@ -277,7 +277,9 @@ export class DiscordAdapter implements PlatformAdapter { isDirectMessage: async () => discordMessage.channel.type === ChannelType.DM, send: async (messageData) => { - const sentMessage = await discordMessage.channel.send(messageData); + const sentMessage = await (discordMessage.channel as TextChannel).send( + messageData + ); return this.convertSentMessage(sentMessage); }, reply: async (messageData) => { @@ -309,12 +311,12 @@ export class DiscordAdapter implements PlatformAdapter { return Promise.all(messages.map((msg) => this.convertMessage(msg))); }, sendFile: async (fileUrl, fileName) => { - await discordMessage.channel.send({ + await (discordMessage.channel as TextChannel).send({ files: [{ attachment: fileUrl, name: fileName }], }); }, sendTyping: async () => { - await discordMessage.channel.sendTyping(); + await (discordMessage.channel as TextChannel).sendTyping(); }, }; @@ -366,19 +368,21 @@ export class DiscordAdapter implements PlatformAdapter { return Promise.all(messages.map((msg) => this.convertMessage(msg))); }, sendFile: async (fileUrl, fileName) => { - await discordMessage.channel.send({ + await (discordMessage.channel as TextChannel).send({ files: [{ attachment: fileUrl, name: fileName }], }); }, sendTyping: async () => { - await discordMessage.channel.sendTyping(); + await (discordMessage.channel as TextChannel).sendTyping(); }, reply: async (messageData) => { const sentMessage = await discordMessage.reply(messageData); return this.convertSentMessage(sentMessage); }, send: async (messageData) => { - const sentMessage = await discordMessage.channel.send(messageData); + const sentMessage = await (discordMessage.channel as TextChannel).send( + messageData + ); return this.convertSentMessage(sentMessage); }, }; diff --git a/interfaces/events.ts b/interfaces/events.ts index 650d505..7c92b90 100644 --- a/interfaces/events.ts +++ b/interfaces/events.ts @@ -1,6 +1,7 @@ import { Elysia, t } from "elysia"; import { userConfigs } from "../config"; import { send_sys_log } from "./log"; +import { get_transcription } from "../tools/ask"; // Define the type for the event callback type EventCallback = ( @@ -187,7 +188,7 @@ export const events = new Elysia() body = textbody; } } - // console.log("Event received", body); + console.log("Event received", body); if (id === "ping") { send_sys_log(`Ping event received: ${JSON.stringify(body)}`); diff --git a/interfaces/message.ts b/interfaces/message.ts index 5ef32a8..9875a96 100644 --- a/interfaces/message.ts +++ b/interfaces/message.ts @@ -28,6 +28,8 @@ export interface Embed { export interface MessageData { content?: string; embeds?: Embed[]; + options?: any; + flags?: any; file?: | { url: string; diff --git a/tools/actions.ts b/tools/actions.ts index 470e085..49e4e9b 100644 --- a/tools/actions.ts +++ b/tools/actions.ts @@ -270,7 +270,7 @@ async function executeAction(action: Action) { tools = 
tools?.length ? tools : undefined;
 
   const response = await ask({
-    model: "gpt-4o-mini",
+    model: "gpt-4o",
     prompt: `You are an Action Executor.
 
     You are called to execute an action based on the provided instruction.
diff --git a/tools/ask.ts b/tools/ask.ts
index cbd5316..8fef093 100644
--- a/tools/ask.ts
+++ b/tools/ask.ts
@@ -2,6 +2,7 @@ import OpenAI from "openai";
 import { saveApiUsage } from "../usage";
 import axios from "axios";
 import fs from "fs";
+import path from "path";
 import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
 import {
   ChatCompletion,
@@ -132,13 +133,17 @@ export async function ask({
   name,
   tools,
   seed,
+  json,
+  image_url,
 }: {
   model?: string;
   prompt: string;
   message?: string;
+  image_url?: string;
   name?: string;
   tools?: RunnableToolFunctionWithParse[];
   seed?: string;
+  json?: boolean;
 }): Promise<ChatCompletion> {
   // Initialize OpenAI instances
   const openai = new OpenAI({
@@ -171,10 +176,24 @@
       ...history,
       {
         role: "user",
-        content: message,
+        content: image_url
+          ? [
+              {
+                type: "text",
+                text: message,
+              },
+              {
+                type: "image_url",
+                image_url: {
+                  url: image_url,
+                },
+              },
+            ]
+          : message,
         name,
       },
     ];
+    console.log("got image:", image_url?.slice(0, 20));
   } else if (seed && !message) {
     // If seed is provided but no new message, just retrieve history
     const history = getMessageHistory(seed);
@@ -189,7 +208,20 @@
     // If no seed but message is provided, send system prompt and user message without history
     messages.push({
       role: "user",
-      content: message,
+      content: image_url
+        ? [
+            {
+              type: "text",
+              text: message,
+            },
+            {
+              type: "image_url",
+              image_url: {
+                url: image_url,
+              },
+            },
+          ]
+        : message,
       name,
     });
   }
@@ -228,6 +260,7 @@
       model,
       messages,
       tools,
+      response_format: json ? { type: "json_object" } : undefined,
     })
     .on("functionCall", (functionCall) => {
       send_sys_log(`ASK Function call: ${JSON.stringify(functionCall)}`);
@@ -293,15 +326,20 @@
 const transcriptionCacheFile = pathInDataDir("transcription_cache.json");
 
 export async function get_transcription(
-  file_url: string,
+  input: string | File, // Accept either a file URL (string) or a File object
   binary?: boolean,
   key?: string
 ) {
+  // const openai = new OpenAI({
+  //   apiKey: ai_token,
+  // });
+
   const openai = new OpenAI({
-    apiKey: ai_token,
+    apiKey: groq_token,
+    baseURL: groq_baseurl,
   });
 
-  // Step 1: Check if the transcription for this file URL is already cached
+  // Step 1: Check if the transcription for this input (file_url or File) is already cached
   let transcriptionCache: Record<string, string> = {};
 
   // Try to read the cache file if it exists
@@ -310,77 +348,91 @@ export async function get_transcription(
     transcriptionCache = JSON.parse(cacheData);
   }
 
-  if (binary) {
-    // If transcription for this file_url is already in the cache, return it
-    if (key && transcriptionCache[key]) {
-      console.log("Transcription found in cache:", transcriptionCache[key]);
-      return transcriptionCache[key];
+  let filePath: string;
+  let fileExtension: string;
+
+  // Determine if the input is a File or URL and handle accordingly
+  if (input instanceof File) {
+    // Check the MIME type for audio validation
+    if (!input.type.startsWith("audio/")) {
+      throw new Error("The provided file is not an audio file.");
+    }
+
+    // Map the MIME type to a file extension; unmapped audio types are rejected
+    const mappedExtension = getExtensionFromMimeType(input.type);
"ogg"; + if (!fileExtension) { + throw new Error(`Unsupported audio file type: ${input.type}`); } - const binaryData = Buffer.from(file_url, "base64"); - // fs.writeFile("/home/audio_whats.ogg", binaryData, function (err) {}); + // Write the file to the filesystem temporarily with the correct extension + filePath = `/tmp/audio${Date.now()}.${fileExtension}`; + const buffer = await input.arrayBuffer(); + fs.writeFileSync(filePath, new Uint8Array(buffer)); + } else if (typeof input === "string") { + if (binary) { + // If input is binary data + const binaryData = Buffer.from(input, "base64"); + if (key && transcriptionCache[key]) { + console.log("Transcription found in cache:", transcriptionCache[key]); + return transcriptionCache[key]; + } + filePath = `/tmp/audio${Date.now()}.ogg`; // Default to .ogg for binary input + fs.writeFileSync(filePath, new Uint8Array(binaryData)); + } else { + // Treat input as a file URL and extract the file extension + fileExtension = path.extname(input).slice(1).toLowerCase(); + if (!["mp3", "ogg", "wav", "m4a"].includes(fileExtension)) { + throw new Error( + "The provided URL does not point to a valid audio file." + ); + } + isAudio = true; - const filePath = `/tmp/audio${Date.now()}.ogg`; + // Step 2: Download the file from the URL + const response = await axios({ + url: input, + method: "GET", + responseType: "stream", + }); - fs.writeFileSync(filePath, new Uint8Array(binaryData)); + filePath = `/tmp/audio${Date.now()}.${fileExtension}`; - // Step 3: Send the file to OpenAI's Whisper model - const transcription = await openai.audio.transcriptions.create({ - model: "whisper-1", - file: fs.createReadStream(filePath), - }); + // Save the downloaded file locally + const writer = fs.createWriteStream(filePath); + response.data.pipe(writer); - // Delete the temp file - fs.unlinkSync(filePath); - - // Step 4: Save the transcription to the cache - key && (transcriptionCache[key] = transcription.text); - fs.writeFileSync( - transcriptionCacheFile, - JSON.stringify(transcriptionCache, null, 2) + await new Promise((resolve, reject) => { + writer.on("finish", resolve); + writer.on("error", reject); + }); + } + } else { + throw new Error( + "Invalid input type. Must be either a file URL or a File object." 
+    );
+  }
 
-    console.log("Transcription:", transcription);
-
-    return transcription.text;
-  }
-
-  // If transcription for this file_url is already in the cache, return it
-  if (transcriptionCache[file_url]) {
-    console.log("Transcription found in cache:", transcriptionCache[file_url]);
-    return transcriptionCache[file_url];
   }
 
   try {
-    // Step 2: Download the file from the URL
-    const response = await axios({
-      url: file_url,
-      method: "GET",
-      responseType: "stream",
-    });
-
-    const filePath = `/tmp/audio${Date.now()}.ogg`;
-
-    // Save the downloaded file locally
-    const writer = fs.createWriteStream(filePath);
-    response.data.pipe(writer);
-
-    await new Promise((resolve, reject) => {
-      writer.on("finish", resolve);
-      writer.on("error", reject);
-    });
-
-    // Step 3: Send the file to OpenAI's Whisper model
+    // Step 3: Send the file to the transcription model (distil-whisper via Groq's OpenAI-compatible API)
     const transcription = await openai.audio.transcriptions.create({
-      model: "whisper-1",
+      // model: "whisper-1",
      model: "distil-whisper-large-v3-en",
       file: fs.createReadStream(filePath),
+      language: "en", // Optional
+      temperature: 0.0, // Optional
     });
 
     // Delete the temp file
     fs.unlinkSync(filePath);
 
     // Step 4: Save the transcription to the cache
-    transcriptionCache[file_url] = transcription.text;
+    if (key) {
+      transcriptionCache[key] = transcription.text;
+    } else if (typeof input === "string" && !binary) {
+      transcriptionCache[input] = transcription.text;
+    }
     fs.writeFileSync(
       transcriptionCacheFile,
       JSON.stringify(transcriptionCache, null, 2)
@@ -390,5 +442,20 @@
     return transcription.text;
   } catch (error) {
     console.error("Error transcribing audio:", error);
+    throw error;
   }
 }
+
+// Helper function to get file extension based on MIME type
+function getExtensionFromMimeType(mimeType: string): string | null {
+  const mimeTypesMap: Record<string, string> = {
+    "audio/mpeg": "mp3",
+    "audio/ogg": "ogg",
+    "audio/wav": "wav",
+    "audio/x-wav": "wav",
+    "audio/x-m4a": "m4a",
+    "audio/m4a": "m4a",
+    // Add other audio types as necessary
+  };
+  return mimeTypesMap[mimeType] || null;
+}
diff --git a/tools/communication.ts b/tools/communication.ts
index 4b3ace7..a2bc429 100644
--- a/tools/communication.ts
+++ b/tools/communication.ts
@@ -111,6 +111,7 @@ You can use the \`memory_manager\` tool to remember user preferences, such as wh
 
   const response = await ask({
     prompt,
+    model: "gpt-4o",
     message: `request: ${request}
 
     prefered_platform: ${prefered_platform}
diff --git a/tools/events.ts b/tools/events.ts
index f84cfb6..94207b2 100644
--- a/tools/events.ts
+++ b/tools/events.ts
@@ -9,7 +9,7 @@ import path from "path";
 import { discordAdapter } from "../interfaces";
 import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
 import { getTools, zodFunction } from ".";
-import { ask } from "./ask";
+import { ask, get_transcription } from "./ask";
 import { get_actions } from "./actions";
 import { pathInDataDir, userConfigs } from "../config";
 import { memory_manager_guide, memory_manager_init } from "./memory-manager";
@@ -398,11 +398,52 @@ function registerListener(listener: EventListener) {
 
     const is_voice = listener.eventId === "on_voice_message";
 
+    let attached_image: string | undefined = undefined;
+
     if (is_voice) {
       tools = getTools(
         contextMessage.author.username,
         contextMessage
       ) as RunnableToolFunctionWithParse[];
+
+      const audio = (payload as any)?.transcription;
+      if (audio instanceof File && audio.type.includes("audio")) {
+        console.log("Transcribing audio for voice event listener.");
+        (payload as any).transcription = await get_transcription(audio);
+      }
+
+      const otherContextData = (payload as any)?.other_context_data;
+
+      if (otherContextData instanceof File) {
+        if (otherContextData.type.includes("image")) {
+          // Read the file as a buffer
+          const buffer = await otherContextData.arrayBuffer();
+
+          // Convert the buffer to a base64 data URL and attach it for the model
+          attached_image = `data:${
+            otherContextData.type
+          };base64,${Buffer.from(buffer).toString("base64")}`;
+        } else {
+          console.log("The provided file is not an image.");
+        }
+      } else {
+        console.log("No valid file provided in other_context_data.");
+      }
     }
 
     console.log("Running ASK for event listener: ", listener.description);
@@ -471,12 +512,12 @@
     - Payload: ${JSON.stringify(payload, null, 2)}
 
     Follow the transcript provided in the payload.
-    Reply only in plain text without markdown or any other formatting.
+
+    Your response must be in plain text without markdown or any other formatting.
     `;
 
     if (system_prompts) {
       prompt = `${system_prompts.map((p) => p.content).join("\n\n")}`;
-      // console.log("Voice system Prompt: ", prompt);
     }
 
     const response = !is_voice
       ? await ask({
           prompt,
           message: payload_received,
           tools,
         })
       : await ask({
-          model: "gpt-4o-mini",
+          model: attached_image ? "gpt-4o" : "gpt-4o-mini",
           prompt,
           message: voice_prompt,
+          image_url: attached_image,
           seed: `voice-anya-${listener.id}-${eventId}`,
           tools,
         });
@@ -503,7 +545,11 @@
     }
 
     // Send a message to the user indicating the event was triggered
-    if (notify) await contextMessage.send({ content });
+    if (notify)
+      await contextMessage.send({
+        content,
+        // 4096 = SUPPRESS_NOTIFICATIONS, so voice replies arrive silently
+        flags: is_voice ? [4096] : undefined,
+      });
     else console.log("Silenced Notification: ", content);
 
     // Handle auto-stop options
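
Usage notes (editor's sketch, not part of the applied diff):

ask() now accepts an image and can force JSON output. A minimal call-site
sketch, assuming a vision-capable model and a caller at the repo root; the
function name and strings are illustrative, and the data URL would be built
the same way the event listener above builds one:

    import { ask } from "./tools/ask";

    async function describeImage(dataUrl: string) {
      const completion = await ask({
        model: "gpt-4o", // pick a vision-capable model whenever image_url is set
        prompt: "You describe images attached to events.",
        message: "What is in this image?",
        image_url: dataUrl, // a data: URL or a plain https URL
        json: false, // true switches response_format to { type: "json_object" }
      });
      return completion;
    }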
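get_transcription() likewise accepts a web File in addition to a URL or a
base64 string. A sketch of the File path, assuming the upload arrives as a
File object (the cache key "voice-note" is illustrative; File inputs are only
cached when a key is passed):

    import { get_transcription } from "./tools/ask";

    async function transcribeVoiceNote(file: File) {
      // Throws unless file.type is an audio MIME type.
      const text = await get_transcription(file, undefined, "voice-note");
      console.log("Transcript:", text);
    }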