feat: support voice & image data as input to event
This commit is contained in:
parent
97add456a7
commit
b2ad77e5b5
|
@ -277,7 +277,9 @@ export class DiscordAdapter implements PlatformAdapter {
|
|||
isDirectMessage: async () =>
|
||||
discordMessage.channel.type === ChannelType.DM,
|
||||
send: async (messageData) => {
|
||||
const sentMessage = await discordMessage.channel.send(messageData);
|
||||
const sentMessage = await (discordMessage.channel as TextChannel).send(
|
||||
messageData
|
||||
);
|
||||
return this.convertSentMessage(sentMessage);
|
||||
},
|
||||
reply: async (messageData) => {
|
||||
|
@ -309,12 +311,12 @@ export class DiscordAdapter implements PlatformAdapter {
|
|||
return Promise.all(messages.map((msg) => this.convertMessage(msg)));
|
||||
},
|
||||
sendFile: async (fileUrl, fileName) => {
|
||||
await discordMessage.channel.send({
|
||||
await (discordMessage.channel as TextChannel).send({
|
||||
files: [{ attachment: fileUrl, name: fileName }],
|
||||
});
|
||||
},
|
||||
sendTyping: async () => {
|
||||
await discordMessage.channel.sendTyping();
|
||||
await (discordMessage.channel as TextChannel).sendTyping();
|
||||
},
|
||||
};
|
||||
|
||||
|
@ -366,19 +368,21 @@ export class DiscordAdapter implements PlatformAdapter {
|
|||
return Promise.all(messages.map((msg) => this.convertMessage(msg)));
|
||||
},
|
||||
sendFile: async (fileUrl, fileName) => {
|
||||
await discordMessage.channel.send({
|
||||
await (discordMessage.channel as TextChannel).send({
|
||||
files: [{ attachment: fileUrl, name: fileName }],
|
||||
});
|
||||
},
|
||||
sendTyping: async () => {
|
||||
await discordMessage.channel.sendTyping();
|
||||
await (discordMessage.channel as TextChannel).sendTyping();
|
||||
},
|
||||
reply: async (messageData) => {
|
||||
const sentMessage = await discordMessage.reply(messageData);
|
||||
return this.convertSentMessage(sentMessage);
|
||||
},
|
||||
send: async (messageData) => {
|
||||
const sentMessage = await discordMessage.channel.send(messageData);
|
||||
const sentMessage = await (discordMessage.channel as TextChannel).send(
|
||||
messageData
|
||||
);
|
||||
return this.convertSentMessage(sentMessage);
|
||||
},
|
||||
};
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import { Elysia, t } from "elysia";
|
||||
import { userConfigs } from "../config";
|
||||
import { send_sys_log } from "./log";
|
||||
import { get_transcription } from "../tools/ask";
|
||||
|
||||
// Define the type for the event callback
|
||||
type EventCallback = (
|
||||
|
@ -187,7 +188,7 @@ export const events = new Elysia()
|
|||
body = textbody;
|
||||
}
|
||||
}
|
||||
// console.log("Event received", body);
|
||||
console.log("Event received", body);
|
||||
|
||||
if (id === "ping") {
|
||||
send_sys_log(`Ping event received: ${JSON.stringify(body)}`);
|
||||
|
|
|
@ -28,6 +28,8 @@ export interface Embed {
|
|||
export interface MessageData {
|
||||
content?: string;
|
||||
embeds?: Embed[];
|
||||
options?: any;
|
||||
flags?: any;
|
||||
file?:
|
||||
| {
|
||||
url: string;
|
||||
|
|
|
@ -270,7 +270,7 @@ async function executeAction(action: Action) {
|
|||
tools = tools?.length ? tools : undefined;
|
||||
|
||||
const response = await ask({
|
||||
model: "gpt-4o-mini",
|
||||
model: "gpt-4o",
|
||||
prompt: `You are an Action Executor.
|
||||
|
||||
You are called to execute an action based on the provided instruction.
|
||||
|
|
183
tools/ask.ts
183
tools/ask.ts
|
@ -2,6 +2,7 @@ import OpenAI from "openai";
|
|||
import { saveApiUsage } from "../usage";
|
||||
import axios from "axios";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
|
||||
import {
|
||||
ChatCompletion,
|
||||
|
@ -132,13 +133,17 @@ export async function ask({
|
|||
name,
|
||||
tools,
|
||||
seed,
|
||||
json,
|
||||
image_url,
|
||||
}: {
|
||||
model?: string;
|
||||
prompt: string;
|
||||
message?: string;
|
||||
image_url?: string;
|
||||
name?: string;
|
||||
tools?: RunnableToolFunctionWithParse<any>[];
|
||||
seed?: string;
|
||||
json?: boolean;
|
||||
}): Promise<ChatCompletion> {
|
||||
// Initialize OpenAI instances
|
||||
const openai = new OpenAI({
|
||||
|
@ -171,10 +176,24 @@ export async function ask({
|
|||
...history,
|
||||
{
|
||||
role: "user",
|
||||
content: message,
|
||||
content: image_url
|
||||
? [
|
||||
{
|
||||
type: "text",
|
||||
text: message,
|
||||
},
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: image_url,
|
||||
},
|
||||
},
|
||||
]
|
||||
: message,
|
||||
name,
|
||||
},
|
||||
];
|
||||
console.log("got image:", image_url?.slice(0, 20));
|
||||
} else if (seed && !message) {
|
||||
// If seed is provided but no new message, just retrieve history
|
||||
const history = getMessageHistory(seed);
|
||||
|
@ -189,7 +208,20 @@ export async function ask({
|
|||
// If no seed but message is provided, send system prompt and user message without history
|
||||
messages.push({
|
||||
role: "user",
|
||||
content: message,
|
||||
content: image_url
|
||||
? [
|
||||
{
|
||||
type: "text",
|
||||
text: message,
|
||||
},
|
||||
{
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: image_url,
|
||||
},
|
||||
},
|
||||
]
|
||||
: message,
|
||||
name,
|
||||
});
|
||||
}
|
||||
|
@ -228,6 +260,7 @@ export async function ask({
|
|||
model,
|
||||
messages,
|
||||
tools,
|
||||
response_format: json ? { type: "json_object" } : undefined,
|
||||
})
|
||||
.on("functionCall", (functionCall) => {
|
||||
send_sys_log(`ASK Function call: ${JSON.stringify(functionCall)}`);
|
||||
|
@ -293,15 +326,20 @@ export async function ask({
|
|||
const transcriptionCacheFile = pathInDataDir("transcription_cache.json");
|
||||
|
||||
export async function get_transcription(
|
||||
file_url: string,
|
||||
input: string | File, // Accept either a file URL (string) or a File object
|
||||
binary?: boolean,
|
||||
key?: string
|
||||
) {
|
||||
// const openai = new OpenAI({
|
||||
// apiKey: ai_token,
|
||||
// });
|
||||
|
||||
const openai = new OpenAI({
|
||||
apiKey: ai_token,
|
||||
apiKey: groq_token,
|
||||
baseURL: groq_baseurl,
|
||||
});
|
||||
|
||||
// Step 1: Check if the transcription for this file URL is already cached
|
||||
// Step 1: Check if the transcription for this input (file_url or File) is already cached
|
||||
let transcriptionCache: Record<string, string> = {};
|
||||
|
||||
// Try to read the cache file if it exists
|
||||
|
@ -310,77 +348,91 @@ export async function get_transcription(
|
|||
transcriptionCache = JSON.parse(cacheData);
|
||||
}
|
||||
|
||||
if (binary) {
|
||||
// If transcription for this file_url is already in the cache, return it
|
||||
if (key && transcriptionCache[key]) {
|
||||
console.log("Transcription found in cache:", transcriptionCache[key]);
|
||||
return transcriptionCache[key];
|
||||
let filePath: string;
|
||||
let isAudio = false;
|
||||
let fileExtension: string;
|
||||
|
||||
// Determine if the input is a File or URL and handle accordingly
|
||||
if (input instanceof File) {
|
||||
// Check the MIME type for audio validation
|
||||
if (!input.type.startsWith("audio/")) {
|
||||
throw new Error("The provided file is not an audio file.");
|
||||
}
|
||||
isAudio = true;
|
||||
|
||||
// Set file extension based on the MIME type
|
||||
fileExtension = getExtensionFromMimeType(input.type) ?? "ogg";
|
||||
if (!fileExtension) {
|
||||
throw new Error(`Unsupported audio file type: ${input.type}`);
|
||||
}
|
||||
|
||||
const binaryData = Buffer.from(file_url, "base64");
|
||||
// fs.writeFile("/home/audio_whats.ogg", binaryData, function (err) {});
|
||||
// Write the file to the filesystem temporarily with the correct extension
|
||||
filePath = `/tmp/audio${Date.now()}.${fileExtension}`;
|
||||
const buffer = await input.arrayBuffer();
|
||||
fs.writeFileSync(filePath, new Uint8Array(buffer));
|
||||
} else if (typeof input === "string") {
|
||||
if (binary) {
|
||||
// If input is binary data
|
||||
const binaryData = Buffer.from(input, "base64");
|
||||
if (key && transcriptionCache[key]) {
|
||||
console.log("Transcription found in cache:", transcriptionCache[key]);
|
||||
return transcriptionCache[key];
|
||||
}
|
||||
filePath = `/tmp/audio${Date.now()}.ogg`; // Default to .ogg for binary input
|
||||
fs.writeFileSync(filePath, new Uint8Array(binaryData));
|
||||
} else {
|
||||
// Treat input as a file URL and extract the file extension
|
||||
fileExtension = path.extname(input).slice(1).toLowerCase();
|
||||
if (!["mp3", "ogg", "wav", "m4a"].includes(fileExtension)) {
|
||||
throw new Error(
|
||||
"The provided URL does not point to a valid audio file."
|
||||
);
|
||||
}
|
||||
isAudio = true;
|
||||
|
||||
const filePath = `/tmp/audio${Date.now()}.ogg`;
|
||||
// Step 2: Download the file from the URL
|
||||
const response = await axios({
|
||||
url: input,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
|
||||
fs.writeFileSync(filePath, new Uint8Array(binaryData));
|
||||
filePath = `/tmp/audio${Date.now()}.${fileExtension}`;
|
||||
|
||||
// Step 3: Send the file to OpenAI's Whisper model
|
||||
const transcription = await openai.audio.transcriptions.create({
|
||||
model: "whisper-1",
|
||||
file: fs.createReadStream(filePath),
|
||||
});
|
||||
// Save the downloaded file locally
|
||||
const writer = fs.createWriteStream(filePath);
|
||||
response.data.pipe(writer);
|
||||
|
||||
// Delete the temp file
|
||||
fs.unlinkSync(filePath);
|
||||
|
||||
// Step 4: Save the transcription to the cache
|
||||
key && (transcriptionCache[key] = transcription.text);
|
||||
fs.writeFileSync(
|
||||
transcriptionCacheFile,
|
||||
JSON.stringify(transcriptionCache, null, 2)
|
||||
await new Promise((resolve, reject) => {
|
||||
writer.on("finish", resolve);
|
||||
writer.on("error", reject);
|
||||
});
|
||||
}
|
||||
} else {
|
||||
throw new Error(
|
||||
"Invalid input type. Must be either a file URL or a File object."
|
||||
);
|
||||
|
||||
console.log("Transcription:", transcription);
|
||||
|
||||
return transcription.text;
|
||||
}
|
||||
|
||||
// If transcription for this file_url is already in the cache, return it
|
||||
if (transcriptionCache[file_url]) {
|
||||
console.log("Transcription found in cache:", transcriptionCache[file_url]);
|
||||
return transcriptionCache[file_url];
|
||||
}
|
||||
|
||||
try {
|
||||
// Step 2: Download the file from the URL
|
||||
const response = await axios({
|
||||
url: file_url,
|
||||
method: "GET",
|
||||
responseType: "stream",
|
||||
});
|
||||
|
||||
const filePath = `/tmp/audio${Date.now()}.ogg`;
|
||||
|
||||
// Save the downloaded file locally
|
||||
const writer = fs.createWriteStream(filePath);
|
||||
response.data.pipe(writer);
|
||||
|
||||
await new Promise((resolve, reject) => {
|
||||
writer.on("finish", resolve);
|
||||
writer.on("error", reject);
|
||||
});
|
||||
|
||||
// Step 3: Send the file to OpenAI's Whisper model
|
||||
// Step 3: Send the file to OpenAI's Whisper model for transcription
|
||||
const transcription = await openai.audio.transcriptions.create({
|
||||
model: "whisper-1",
|
||||
// model: "whisper-1",
|
||||
model: "distil-whisper-large-v3-en",
|
||||
file: fs.createReadStream(filePath),
|
||||
language: "en", // Optional
|
||||
temperature: 0.0, // Optional
|
||||
});
|
||||
|
||||
// Delete the temp file
|
||||
fs.unlinkSync(filePath);
|
||||
|
||||
// Step 4: Save the transcription to the cache
|
||||
transcriptionCache[file_url] = transcription.text;
|
||||
if (key) {
|
||||
transcriptionCache[key] = transcription.text;
|
||||
} else if (typeof input === "string") {
|
||||
transcriptionCache[input] = transcription.text;
|
||||
}
|
||||
fs.writeFileSync(
|
||||
transcriptionCacheFile,
|
||||
JSON.stringify(transcriptionCache, null, 2)
|
||||
|
@ -390,5 +442,20 @@ export async function get_transcription(
|
|||
return transcription.text;
|
||||
} catch (error) {
|
||||
console.error("Error transcribing audio:", error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Maps an audio MIME type to the file extension used for that audio format.
 *
 * The lookup tolerates the ways MIME types appear in practice: letter case,
 * surrounding whitespace, and any parameters after the first `;` are ignored,
 * so `audio/ogg; codecs=opus` resolves the same as `audio/ogg`.
 *
 * @param mimeType - MIME type string, e.g. the `type` of a `File`.
 * @returns The extension without a leading dot, or `null` when the type is
 *   not in the table.
 */
function getExtensionFromMimeType(mimeType: string): string | null {
  // A Map rather than an object literal: names inherited from
  // Object.prototype ("constructor", "toString", ...) must never be
  // returned as if they were extensions.
  const extensionsByMimeType = new Map<string, string>([
    ["audio/mpeg", "mp3"],
    ["audio/ogg", "ogg"],
    ["audio/wav", "wav"],
    ["audio/x-wav", "wav"],
    ["audio/x-m4a", "m4a"],
    ["audio/m4a", "m4a"],
    // Add other audio types as necessary
  ]);

  // Keep only the "type/subtype" part; parameters such as "; codecs=opus"
  // do not affect which extension applies.
  const [essence = ""] = mimeType.split(";");

  return extensionsByMimeType.get(essence.trim().toLowerCase()) ?? null;
}
|
||||
|
|
|
@ -111,6 +111,7 @@ You can use the \`memory_manager\` tool to remember user preferences, such as wh
|
|||
|
||||
const response = await ask({
|
||||
prompt,
|
||||
model: "gpt-4o",
|
||||
message: `request: ${request}
|
||||
|
||||
prefered_platform: ${prefered_platform}
|
||||
|
|
|
@ -9,7 +9,7 @@ import path from "path";
|
|||
import { discordAdapter } from "../interfaces";
|
||||
import { RunnableToolFunctionWithParse } from "openai/lib/RunnableFunction.mjs";
|
||||
import { getTools, zodFunction } from ".";
|
||||
import { ask } from "./ask";
|
||||
import { ask, get_transcription } from "./ask";
|
||||
import { get_actions } from "./actions";
|
||||
import { pathInDataDir, userConfigs } from "../config";
|
||||
import { memory_manager_guide, memory_manager_init } from "./memory-manager";
|
||||
|
@ -398,11 +398,52 @@ function registerListener(listener: EventListener) {
|
|||
|
||||
const is_voice = listener.eventId === "on_voice_message";
|
||||
|
||||
let attached_image: string | undefined = undefined;
|
||||
|
||||
if (is_voice) {
|
||||
tools = getTools(
|
||||
contextMessage.author.username,
|
||||
contextMessage
|
||||
) as RunnableToolFunctionWithParse<any>[];
|
||||
|
||||
const audio = ((payload as any) ?? {}).transcription;
|
||||
if (audio && audio instanceof File) {
|
||||
if (audio.type.includes("audio")) {
|
||||
console.log("Transcribing audio for voice event listener.");
|
||||
(payload as any).transcription = await get_transcription(
|
||||
audio as File
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
const otherContextData = (payload as any)?.other_context_data;
|
||||
|
||||
if (otherContextData instanceof File) {
|
||||
if (otherContextData.type.includes("image")) {
|
||||
// Read the file as a buffer
|
||||
const buffer = await otherContextData.arrayBuffer();
|
||||
|
||||
// Convert the buffer to a base64 string
|
||||
const base64Url = `data:${
|
||||
otherContextData.type
|
||||
};base64,${Buffer.from(buffer).toString("base64")}`;
|
||||
|
||||
// Create the object with base64 URL
|
||||
const imageObject = {
|
||||
type: "image_url",
|
||||
image_url: {
|
||||
url: base64Url,
|
||||
},
|
||||
};
|
||||
|
||||
// Do something with imageObject, like sending it in a response or logging
|
||||
attached_image = base64Url;
|
||||
} else {
|
||||
console.log("The provided file is not an image.");
|
||||
}
|
||||
} else {
|
||||
console.log("No valid file provided in other_context_data.");
|
||||
}
|
||||
}
|
||||
|
||||
console.log("Running ASK for event listener: ", listener.description);
|
||||
|
@ -471,12 +512,12 @@ function registerListener(listener: EventListener) {
|
|||
- Payload: ${JSON.stringify(payload, null, 2)}
|
||||
|
||||
Follow the transcript provided in the payload.
|
||||
Reply only in plain text without markdown or any other formatting.
|
||||
|
||||
You response must be in plain text without markdown or any other formatting.
|
||||
`;
|
||||
|
||||
if (system_prompts) {
|
||||
prompt = `${system_prompts.map((p) => p.content).join("\n\n")}`;
|
||||
// console.log("Voice system Prompt: ", prompt);
|
||||
}
|
||||
|
||||
const response = !is_voice
|
||||
|
@ -486,9 +527,10 @@ function registerListener(listener: EventListener) {
|
|||
tools,
|
||||
})
|
||||
: await ask({
|
||||
model: "gpt-4o-mini",
|
||||
model: attached_image ? "gpt-4o" : "gpt-4o-mini",
|
||||
prompt,
|
||||
message: voice_prompt,
|
||||
image_url: attached_image,
|
||||
seed: `voice-anya-${listener.id}-${eventId}`,
|
||||
tools,
|
||||
});
|
||||
|
@ -503,7 +545,11 @@ function registerListener(listener: EventListener) {
|
|||
}
|
||||
|
||||
// Send a message to the user indicating the event was triggered
|
||||
if (notify) await contextMessage.send({ content });
|
||||
if (notify)
|
||||
await contextMessage.send({
|
||||
content,
|
||||
flags: is_voice ? [4096] : undefined,
|
||||
});
|
||||
else console.log("Silenced Notification: ", content);
|
||||
|
||||
// Handle auto-stop options
|
||||
|
|
Loading…
Reference in New Issue