From b2ad77e5b5408523c960385305aa59873c0a6e7b Mon Sep 17 00:00:00 2001 From: Raj Sharma Date: Tue, 8 Oct 2024 22:19:46 +0530 Subject: [PATCH] feat: support voice & image data as input to event --- interfaces/discord.ts | 16 ++-- interfaces/events.ts | 3 +- interfaces/message.ts | 2 + tools/actions.ts | 2 +- tools/ask.ts | 183 ++++++++++++++++++++++++++++------------- tools/communication.ts | 1 + tools/events.ts | 56 +++++++++++-- 7 files changed, 192 insertions(+), 71 deletions(-) diff --git a/interfaces/discord.ts b/interfaces/discord.ts index 474d02a..9849384 100644 --- a/interfaces/discord.ts +++ b/interfaces/discord.ts @@ -277,7 +277,9 @@ export class DiscordAdapter implements PlatformAdapter { isDirectMessage: async () => discordMessage.channel.type === ChannelType.DM, send: async (messageData) => { - const sentMessage = await discordMessage.channel.send(messageData); + const sentMessage = await (discordMessage.channel as TextChannel).send( + messageData + ); return this.convertSentMessage(sentMessage); }, reply: async (messageData) => { @@ -309,12 +311,12 @@ export class DiscordAdapter implements PlatformAdapter { return Promise.all(messages.map((msg) => this.convertMessage(msg))); }, sendFile: async (fileUrl, fileName) => { - await discordMessage.channel.send({ + await (discordMessage.channel as TextChannel).send({ files: [{ attachment: fileUrl, name: fileName }], }); }, sendTyping: async () => { - await discordMessage.channel.sendTyping(); + await (discordMessage.channel as TextChannel).sendTyping(); }, }; @@ -366,19 +368,21 @@ export class DiscordAdapter implements PlatformAdapter { return Promise.all(messages.map((msg) => this.convertMessage(msg))); }, sendFile: async (fileUrl, fileName) => { - await discordMessage.channel.send({ + await (discordMessage.channel as TextChannel).send({ files: [{ attachment: fileUrl, name: fileName }], }); }, sendTyping: async () => { - await discordMessage.channel.sendTyping(); + await (discordMessage.channel as TextChannel).sendTyping(); }, reply: async (messageData) => { const sentMessage = await discordMessage.reply(messageData); return this.convertSentMessage(sentMessage); }, send: async (messageData) => { - const sentMessage = await discordMessage.channel.send(messageData); + const sentMessage = await (discordMessage.channel as TextChannel).send( + messageData + ); return this.convertSentMessage(sentMessage); }, }; diff --git a/interfaces/events.ts b/interfaces/events.ts index 650d505..7c92b90 100644 --- a/interfaces/events.ts +++ b/interfaces/events.ts @@ -1,6 +1,7 @@ import { Elysia, t } from "elysia"; import { userConfigs } from "../config"; import { send_sys_log } from "./log"; +import { get_transcription } from "../tools/ask"; // Define the type for the event callback type EventCallback = ( @@ -187,7 +188,7 @@ export const events = new Elysia() body = textbody; } } - // console.log("Event received", body); + console.log("Event received", body); if (id === "ping") { send_sys_log(`Ping event received: ${JSON.stringify(body)}`); diff --git a/interfaces/message.ts b/interfaces/message.ts index 5ef32a8..9875a96 100644 --- a/interfaces/message.ts +++ b/interfaces/message.ts @@ -28,6 +28,8 @@ export interface Embed { export interface MessageData { content?: string; embeds?: Embed[]; + options?: any; + flags?: any; file?: | { url: string; diff --git a/tools/actions.ts b/tools/actions.ts index 470e085..49e4e9b 100644 --- a/tools/actions.ts +++ b/tools/actions.ts @@ -270,7 +270,7 @@ async function executeAction(action: Action) { tools = 
tools?.length ? tools : undefined;
 
   const response = await ask({
-    model: "gpt-4o-mini",
+    model: "gpt-4o",
     prompt: `You are an Action Executor.
 
     You are called to execute an action based on the provided instruction.
diff --git a/tools/ask.ts b/tools/ask.ts
index cbd5316..8fef093 100644
--- a/tools/ask.ts
+++ b/tools/ask.ts
@@ -2,6 +2,7 @@ import OpenAI from "openai";
 import { saveApiUsage } from "../usage";
 import axios from "axios";
 import fs from "fs";
+import path from "path";
 import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
 import {
   ChatCompletion,
@@ -132,13 +133,17 @@ export async function ask({
   name,
   tools,
   seed,
+  json,
+  image_url,
 }: {
   model?: string;
   prompt: string;
   message?: string;
+  image_url?: string;
   name?: string;
   tools?: RunnableToolFunctionWithParse[];
   seed?: string;
+  json?: boolean;
 }): Promise<ChatCompletion> {
   // Initialize OpenAI instances
   const openai = new OpenAI({
@@ -171,10 +176,24 @@
       ...history,
       {
         role: "user",
-        content: message,
+        content: image_url
+          ? [
+              {
+                type: "text",
+                text: message,
+              },
+              {
+                type: "image_url",
+                image_url: {
+                  url: image_url,
+                },
+              },
+            ]
+          : message,
         name,
       },
     ];
+    console.log("got image:", image_url?.slice(0, 20));
   } else if (seed && !message) {
     // If seed is provided but no new message, just retrieve history
     const history = getMessageHistory(seed);
@@ -189,7 +208,20 @@
     // If no seed but message is provided, send system prompt and user message without history
     messages.push({
       role: "user",
-      content: message,
+      content: image_url
+        ? [
+            {
+              type: "text",
+              text: message,
+            },
+            {
+              type: "image_url",
+              image_url: {
+                url: image_url,
+              },
+            },
+          ]
+        : message,
       name,
     });
   }
@@ -228,6 +260,7 @@
       model,
       messages,
       tools,
+      response_format: json ? { type: "json_object" } : undefined,
     })
     .on("functionCall", (functionCall) => {
       send_sys_log(`ASK Function call: ${JSON.stringify(functionCall)}`);
@@ -293,15 +326,20 @@
 const transcriptionCacheFile = pathInDataDir("transcription_cache.json");
 
 export async function get_transcription(
-  file_url: string,
+  input: string | File, // Accept either a file URL (string) or a File object
   binary?: boolean,
   key?: string
 ) {
+  // const openai = new OpenAI({
+  //   apiKey: ai_token,
+  // });
+
   const openai = new OpenAI({
-    apiKey: ai_token,
+    apiKey: groq_token,
+    baseURL: groq_baseurl,
   });
 
-  // Step 1: Check if the transcription for this file URL is already cached
+  // Step 1: Check if the transcription for this input (file_url or File) is already cached
   let transcriptionCache: Record<string, string> = {};
 
   // Try to read the cache file if it exists
@@ -310,77 +348,91 @@ export async function get_transcription(
     transcriptionCache = JSON.parse(cacheData);
   }
 
-  if (binary) {
-    // If transcription for this file_url is already in the cache, return it
-    if (key && transcriptionCache[key]) {
-      console.log("Transcription found in cache:", transcriptionCache[key]);
-      return transcriptionCache[key];
+  let filePath: string;
+  let fileExtension: string;
+
+  // Determine if the input is a File or URL and handle accordingly
+  if (input instanceof File) {
+    // Check the MIME type for audio validation
+    if (!input.type.startsWith("audio/")) {
+      throw new Error("The provided file is not an audio file.");
+    }
+
+    // Map the MIME type to a file extension; unmapped audio types are rejected
+    const mappedExtension = getExtensionFromMimeType(input.type);
"ogg"; + if (!fileExtension) { + throw new Error(`Unsupported audio file type: ${input.type}`); } - const binaryData = Buffer.from(file_url, "base64"); - // fs.writeFile("/home/audio_whats.ogg", binaryData, function (err) {}); + // Write the file to the filesystem temporarily with the correct extension + filePath = `/tmp/audio${Date.now()}.${fileExtension}`; + const buffer = await input.arrayBuffer(); + fs.writeFileSync(filePath, new Uint8Array(buffer)); + } else if (typeof input === "string") { + if (binary) { + // If input is binary data + const binaryData = Buffer.from(input, "base64"); + if (key && transcriptionCache[key]) { + console.log("Transcription found in cache:", transcriptionCache[key]); + return transcriptionCache[key]; + } + filePath = `/tmp/audio${Date.now()}.ogg`; // Default to .ogg for binary input + fs.writeFileSync(filePath, new Uint8Array(binaryData)); + } else { + // Treat input as a file URL and extract the file extension + fileExtension = path.extname(input).slice(1).toLowerCase(); + if (!["mp3", "ogg", "wav", "m4a"].includes(fileExtension)) { + throw new Error( + "The provided URL does not point to a valid audio file." + ); + } + isAudio = true; - const filePath = `/tmp/audio${Date.now()}.ogg`; + // Step 2: Download the file from the URL + const response = await axios({ + url: input, + method: "GET", + responseType: "stream", + }); - fs.writeFileSync(filePath, new Uint8Array(binaryData)); + filePath = `/tmp/audio${Date.now()}.${fileExtension}`; - // Step 3: Send the file to OpenAI's Whisper model - const transcription = await openai.audio.transcriptions.create({ - model: "whisper-1", - file: fs.createReadStream(filePath), - }); + // Save the downloaded file locally + const writer = fs.createWriteStream(filePath); + response.data.pipe(writer); - // Delete the temp file - fs.unlinkSync(filePath); - - // Step 4: Save the transcription to the cache - key && (transcriptionCache[key] = transcription.text); - fs.writeFileSync( - transcriptionCacheFile, - JSON.stringify(transcriptionCache, null, 2) + await new Promise((resolve, reject) => { + writer.on("finish", resolve); + writer.on("error", reject); + }); + } + } else { + throw new Error( + "Invalid input type. Must be either a file URL or a File object." 
+    );
+  }
 
-    console.log("Transcription:", transcription);
-
-    return transcription.text;
-  }
-
-  // If transcription for this file_url is already in the cache, return it
-  if (transcriptionCache[file_url]) {
-    console.log("Transcription found in cache:", transcriptionCache[file_url]);
-    return transcriptionCache[file_url];
   }
 
   try {
-    // Step 2: Download the file from the URL
-    const response = await axios({
-      url: file_url,
-      method: "GET",
-      responseType: "stream",
-    });
-
-    const filePath = `/tmp/audio${Date.now()}.ogg`;
-
-    // Save the downloaded file locally
-    const writer = fs.createWriteStream(filePath);
-    response.data.pipe(writer);
-
-    await new Promise((resolve, reject) => {
-      writer.on("finish", resolve);
-      writer.on("error", reject);
-    });
-
-    // Step 3: Send the file to OpenAI's Whisper model
+    // Step 3: Send the file to the transcription model (distil-whisper via Groq's OpenAI-compatible API)
     const transcription = await openai.audio.transcriptions.create({
-      model: "whisper-1",
+      // model: "whisper-1",
      model: "distil-whisper-large-v3-en",
       file: fs.createReadStream(filePath),
+      language: "en", // Optional
+      temperature: 0.0, // Optional
     });
 
     // Delete the temp file
     fs.unlinkSync(filePath);
 
     // Step 4: Save the transcription to the cache
-    transcriptionCache[file_url] = transcription.text;
+    if (key) {
+      transcriptionCache[key] = transcription.text;
+    } else if (typeof input === "string" && !binary) {
+      transcriptionCache[input] = transcription.text;
+    }
     fs.writeFileSync(
       transcriptionCacheFile,
       JSON.stringify(transcriptionCache, null, 2)
@@ -390,5 +442,20 @@
     return transcription.text;
   } catch (error) {
     console.error("Error transcribing audio:", error);
+    throw error;
   }
 }
+
+// Helper function to get file extension based on MIME type
+function getExtensionFromMimeType(mimeType: string): string | null {
+  const mimeTypesMap: Record<string, string> = {
+    "audio/mpeg": "mp3",
+    "audio/ogg": "ogg",
+    "audio/wav": "wav",
+    "audio/x-wav": "wav",
+    "audio/x-m4a": "m4a",
+    "audio/m4a": "m4a",
+    // Add other audio types as necessary
+  };
+  return mimeTypesMap[mimeType] || null;
+}
diff --git a/tools/communication.ts b/tools/communication.ts
index 4b3ace7..a2bc429 100644
--- a/tools/communication.ts
+++ b/tools/communication.ts
@@ -111,6 +111,7 @@ You can use the \`memory_manager\` tool to remember user preferences, such as wh
 
   const response = await ask({
     prompt,
+    model: "gpt-4o",
     message: `request: ${request}
 
     prefered_platform: ${prefered_platform}
diff --git a/tools/events.ts b/tools/events.ts
index f84cfb6..94207b2 100644
--- a/tools/events.ts
+++ b/tools/events.ts
@@ -9,7 +9,7 @@ import path from "path";
 import { discordAdapter } from "../interfaces";
 import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
 import { getTools, zodFunction } from ".";
-import { ask } from "./ask";
+import { ask, get_transcription } from "./ask";
 import { get_actions } from "./actions";
 import { pathInDataDir, userConfigs } from "../config";
 import { memory_manager_guide, memory_manager_init } from "./memory-manager";
@@ -398,11 +398,52 @@ function registerListener(listener: EventListener) {
 
     const is_voice = listener.eventId === "on_voice_message";
 
+    let attached_image: string | undefined = undefined;
+
     if (is_voice) {
       tools = getTools(
         contextMessage.author.username,
         contextMessage
       ) as RunnableToolFunctionWithParse[];
+
+      const audio = (payload as any)?.transcription;
+      if (audio instanceof File && audio.type.includes("audio")) {
+        console.log("Transcribing audio for voice event listener.");
+        (payload as any).transcription = await get_transcription(audio);
+      }
+
+      const otherContextData = (payload as any)?.other_context_data;
+
+      if (otherContextData instanceof File) {
+        if (otherContextData.type.includes("image")) {
+          // Read the file as a buffer
+          const buffer = await otherContextData.arrayBuffer();
+
+          // Convert the buffer to a base64 data URL and attach it for the model
+          attached_image = `data:${
+            otherContextData.type
+          };base64,${Buffer.from(buffer).toString("base64")}`;
+        } else {
+          console.log("The provided file is not an image.");
+        }
+      } else {
+        console.log("No valid file provided in other_context_data.");
+      }
     }
 
     console.log("Running ASK for event listener: ", listener.description);
@@ -471,12 +512,12 @@
     - Payload: ${JSON.stringify(payload, null, 2)}
 
     Follow the transcript provided in the payload.
-    Reply only in plain text without markdown or any other formatting.
+
+    Your response must be in plain text without markdown or any other formatting.
     `;
 
     if (system_prompts) {
       prompt = `${system_prompts.map((p) => p.content).join("\n\n")}`;
-      // console.log("Voice system Prompt: ", prompt);
     }
 
     const response = !is_voice
       ? await ask({
           prompt,
           message: payload_received,
           tools,
         })
       : await ask({
-          model: "gpt-4o-mini",
+          model: attached_image ? "gpt-4o" : "gpt-4o-mini",
           prompt,
           message: voice_prompt,
+          image_url: attached_image,
           seed: `voice-anya-${listener.id}-${eventId}`,
           tools,
         });
@@ -503,7 +545,11 @@
     }
 
     // Send a message to the user indicating the event was triggered
-    if (notify) await contextMessage.send({ content });
+    if (notify)
+      await contextMessage.send({
+        content,
+        // 4096 = SUPPRESS_NOTIFICATIONS, so voice replies arrive silently
+        flags: is_voice ? [4096] : undefined,
+      });
     else console.log("Silenced Notification: ", content);
 
     // Handle auto-stop options
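
Usage notes (editor's sketch, not part of the applied diff):

ask() now accepts an image and can force JSON output. A minimal call-site
sketch, assuming a vision-capable model and a caller at the repo root; the
function name and strings are illustrative, and the data URL would be built
the same way the event listener above builds one:

    import { ask } from "./tools/ask";

    async function describeImage(dataUrl: string) {
      const completion = await ask({
        model: "gpt-4o", // pick a vision-capable model whenever image_url is set
        prompt: "You describe images attached to events.",
        message: "What is in this image?",
        image_url: dataUrl, // a data: URL or a plain https URL
        json: false, // true switches response_format to { type: "json_object" }
      });
      return completion;
    }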
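get_transcription() likewise accepts a web File in addition to a URL or a
base64 string. A sketch of the File path, assuming the upload arrives as a
File object (the cache key "voice-note" is illustrative; File inputs are only
cached when a key is passed):

    import { get_transcription } from "./tools/ask";

    async function transcribeVoiceNote(file: File) {
      // Throws unless file.type is an audio MIME type.
      const text = await get_transcription(file, undefined, "voice-note");
      console.log("Transcript:", text);
    }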