yaze 0.3.2
Link to the Past ROM Editor
 
Loading...
Searching...
No Matches
gemini_ai_service.cc
Go to the documentation of this file.
2
3#include <atomic>
4#include <cstdlib>
5#include <iostream>
6#include <map>
7#include <mutex>
8#include <string>
9#include <vector>
10
11#include "absl/strings/str_cat.h"
12#include "absl/strings/str_format.h"
13#include "absl/strings/str_split.h"
14#include "absl/strings/strip.h"
15#include "absl/time/clock.h"
16#include "absl/time/time.h"
20#include "util/platform_paths.h"
21
22#if defined(__APPLE__)
23#include <TargetConditionals.h>
24#endif
25
26#if defined(__APPLE__) && \
27 (TARGET_OS_IPHONE == 1 || TARGET_IPHONE_SIMULATOR == 1)
29#define YAZE_AI_IOS_URLSESSION 1
30#endif
31
32#ifdef YAZE_WITH_JSON
33#include <filesystem>
34#include <fstream>
35
36#include "httplib.h"
37#include "nlohmann/json.hpp"
38
39// OpenSSL initialization for HTTPS support
40#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
41#include <openssl/crypto.h>
42#include <openssl/err.h>
43#include <openssl/ssl.h>
44
45// Global flag to track OpenSSL initialization
46static std::atomic<bool> g_openssl_initialized{false};
47static std::mutex g_openssl_init_mutex;
48
49static void InitializeOpenSSL() {
50 std::lock_guard<std::mutex> lock(g_openssl_init_mutex);
51 if (!g_openssl_initialized.exchange(true)) {
52 OPENSSL_init_ssl(
53 OPENSSL_INIT_LOAD_SSL_STRINGS | OPENSSL_INIT_LOAD_CRYPTO_STRINGS,
54 nullptr);
55 std::cerr << "✓ OpenSSL initialized for HTTPS support" << std::endl;
56 }
57}
58#endif
59#endif
60
61namespace yaze {
62namespace cli {
63
64namespace {
65
66absl::StatusOr<nlohmann::json> BuildGeminiToolPayload(
67 const PromptBuilder& prompt_builder) {
68 auto declarations_or =
70 if (!declarations_or.ok()) {
71 return declarations_or.status();
72 }
73 return ToolSchemaBuilder::BuildGeminiTools(*declarations_or);
74}
75
76} // namespace
77
// Constructs the Gemini service from a GeminiConfig.
//
// Order of work:
//  1. Copies config and the function-calling flag.
//  2. Initializes OpenSSL once (only when built with HTTPS support).
//  3. Loads the prompt catalogue matching config.prompt_version
//     (non-fatal on failure — a warning is printed instead).
//  4. Resolves the system instruction: an explicit config value wins;
//     otherwise a version-specific prompt file found via
//     PlatformPaths::FindAsset; otherwise the PromptBuilder fallback.
GeminiAIService::GeminiAIService(const GeminiConfig& config)
    : function_calling_enabled_(config.use_function_calling), config_(config) {
  if (config_.verbose) {
    std::cerr << "[DEBUG] Initializing Gemini service..." << std::endl;
    std::cerr << "[DEBUG] Function calling: "
              << (function_calling_enabled_ ? "enabled" : "disabled")
              << std::endl;
    std::cerr << "[DEBUG] Prompt version: " << config_.prompt_version
              << std::endl;
  }

#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
  // Initialize OpenSSL for HTTPS support (idempotent; guarded by a flag).
  InitializeOpenSSL();
  if (config_.verbose) {
    std::cerr << "[DEBUG] OpenSSL initialized for HTTPS" << std::endl;
  }
#endif

  // Load command documentation into prompt builder with specified version.
  // Only "v2" has its own catalogue; every other version uses the default.
  std::string catalogue_path = config_.prompt_version == "v2"
                                   ? "assets/agent/prompt_catalogue_v2.yaml"
                                   : "assets/agent/prompt_catalogue.yaml";
  if (auto status = prompt_builder_.LoadResourceCatalogue(catalogue_path);
      !status.ok()) {
    // Non-fatal: the service still runs, prompts just lack command docs.
    std::cerr << "⚠️ Failed to load agent prompt catalogue: "
              << status.message() << std::endl;
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Loaded prompt catalogue" << std::endl;
  }

  if (config_.system_instruction.empty()) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] Building system instruction..." << std::endl;
    }

    // Try to load version-specific system prompt file using FindAsset.
    std::string prompt_file;
    if (config_.prompt_version == "v3") {
      prompt_file = "agent/system_prompt_v3.txt";
    } else if (config_.prompt_version == "v2") {
      prompt_file = "agent/system_prompt_v2.txt";
    } else {
      prompt_file = "agent/system_prompt.txt";
    }

    auto prompt_path = util::PlatformPaths::FindAsset(prompt_file);
    bool loaded = false;

    if (prompt_path.ok()) {
      std::ifstream file(prompt_path->string());
      if (file.good()) {
        // Slurp the whole prompt file into the system instruction.
        std::stringstream buffer;
        buffer << file.rdbuf();
        config_.system_instruction = buffer.str();
        if (config_.verbose) {
          std::cerr << "[DEBUG] Loaded prompt: " << prompt_path->string()
                    << std::endl;
        }
        loaded = true;
      }
    }

    if (!loaded) {
      // Fallback to builder-generated instruction when no file was found.
      if (config_.use_enhanced_prompting) {
        config_.system_instruction =
            prompt_builder_.BuildSystemInstructionWithExamples();
      } else {
        config_.system_instruction = BuildSystemInstruction();
      }
    }
  }

  if (config_.verbose) {
    std::cerr << "[DEBUG] Gemini service initialized" << std::endl;
  }
}
158
  // Toggle whether Gemini function-calling tool schemas are attached to
  // subsequent requests (read by GenerateResponse).
  function_calling_enabled_ = enable;
}
162
163std::vector<std::string> GeminiAIService::GetAvailableTools() const {
164 return {"resource-list", "resource-search",
165 "dungeon-list-sprites", "dungeon-describe-room",
166 "overworld-find-tile", "overworld-describe-map",
167 "overworld-list-warps"};
168}
169
// Builds the fallback system instruction used when enhanced prompting is
// disabled and no prompt file was found; delegates to PromptBuilder's basic
// system instruction.
std::string GeminiAIService::BuildSystemInstruction() {
  return prompt_builder_.BuildSystemInstruction();
}
175
// Injects the active ROM (non-owning pointer) so prompt construction can
// reference real game data; forwarded to the prompt builder.
void GeminiAIService::SetRomContext(Rom* rom) {
  prompt_builder_.SetRom(rom);
}
179
180absl::StatusOr<std::vector<ModelInfo>> GeminiAIService::ListAvailableModels() {
181#ifndef YAZE_WITH_JSON
182 return absl::UnimplementedError("Gemini AI service requires JSON support");
183#else
184 if (config_.api_key.empty()) {
185 // Return default known models if API key is missing
186 std::vector<ModelInfo> defaults = {
187 {.name = "gemini-3.0-preview",
188 .display_name = "Gemini 3.0 Preview",
189 .provider = kProviderGemini,
190 .description = "Cutting-edge model, currently in preview"},
191 {.name = "gemini-3.0-flash-preview",
192 .display_name = "Gemini 3.0 Flash Preview",
193 .provider = kProviderGemini,
194 .description = "Fastest preview model"},
195 {.name = "gemini-2.5-pro",
196 .display_name = "Gemini 2.5 Pro",
197 .provider = kProviderGemini,
198 .description = "High intelligence for complex tasks"},
199 {.name = "gemini-2.5-flash",
200 .display_name = "Gemini 2.5 Flash",
201 .provider = kProviderGemini,
202 .description = "Fastest multimodal model"}};
203 return defaults;
204 }
205
206 try {
207 std::string endpoint =
208 "https://generativelanguage.googleapis.com/v1beta/models?key=" +
209 config_.api_key;
210
211 if (config_.verbose) {
212 std::cerr << "[DEBUG] Listing models: "
213 << endpoint.substr(0, endpoint.find("key=")) << "...'"
214 << std::endl;
215 }
216
217 std::string response_str;
218#if defined(YAZE_AI_IOS_URLSESSION)
219 auto resp_or = ios::UrlSessionHttpRequest("GET", endpoint, {}, "", 8000);
220 if (!resp_or.ok()) {
221 if (config_.verbose) {
222 std::cerr << "[DEBUG] Gemini models request failed: "
223 << resp_or.status().message() << std::endl;
224 }
225 return absl::InternalError("Failed to list Gemini models");
226 }
227 response_str = resp_or->body;
228#else
229 // Use curl to list models from the API
230 std::string curl_cmd = "curl -s -X GET '" + endpoint + "' 2>&1";
231
232#ifdef _WIN32
233 FILE* pipe = _popen(curl_cmd.c_str(), "r");
234#else
235 FILE* pipe = popen(curl_cmd.c_str(), "r");
236#endif
237 if (!pipe) {
238 return absl::InternalError("Failed to execute curl command");
239 }
240
241 char buffer[4096];
242 while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
243 response_str += buffer;
244 }
245
246#ifdef _WIN32
247 _pclose(pipe);
248#else
249 pclose(pipe);
250#endif
251#endif // YAZE_AI_IOS_URLSESSION
252
253 auto models_json = nlohmann::json::parse(response_str, nullptr, false);
254 if (models_json.is_discarded()) {
255 return absl::InternalError("Failed to parse Gemini models JSON");
256 }
257
258 if (!models_json.contains("models")) {
259 // Return defaults on error
260 std::vector<ModelInfo> defaults = {{.name = "gemini-2.5-flash",
261 .display_name = "Gemini 2.0 Flash",
262 .provider = kProviderGemini},
263 {.name = "gemini-1.5-flash",
264 .display_name = "Gemini 1.5 Flash",
265 .provider = kProviderGemini},
266 {.name = "gemini-1.5-pro",
267 .display_name = "Gemini 1.5 Pro",
268 .provider = kProviderGemini}};
269 return defaults;
270 }
271
272 std::vector<ModelInfo> models;
273 for (const auto& m : models_json["models"]) {
274 std::string name = m.value("name", "");
275 // Name comes as "models/gemini-pro", strip prefix
276 if (absl::StartsWith(name, "models/")) {
277 name = name.substr(7);
278 }
279
280 // Filter for gemini models
281 if (absl::StartsWith(name, "gemini")) {
282 ModelInfo info;
283 info.name = name;
284 info.display_name = m.value("displayName", name);
285 info.provider = kProviderGemini;
286 info.description = m.value("description", "");
287 info.family = "gemini";
288 info.is_local = false;
289 models.push_back(std::move(info));
290 }
291 }
292 return models;
293
294 } catch (const std::exception& e) {
295 return absl::InternalError(
296 absl::StrCat("Failed to list models: ", e.what()));
297 }
298#endif
299}
300
  // Probes the Gemini API: verifies the key is configured, then issues a
  // lightweight GET for the configured model's metadata and maps transport
  // and HTTP failures onto descriptive absl error codes.
#ifndef YAZE_WITH_JSON
  return absl::UnimplementedError(
      "Gemini AI service requires JSON support. Build with "
      "-DYAZE_WITH_JSON=ON");
#else
  try {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: start" << std::endl;
    }

    // An API key is mandatory for every Gemini endpoint.
    if (config_.api_key.empty()) {
      return absl::FailedPreconditionError(
          "❌ Gemini API key not configured\n"
          " Set GEMINI_API_KEY environment variable\n"
          " Get your API key at: https://makersuite.google.com/app/apikey");
    }

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: creating HTTPS client"
                << std::endl;
    }
    // Test API connectivity with a simple request.
    // NOTE(review): elsewhere this file avoids httplib for requests due to
    // an SSL initialization issue (see GenerateResponse) — confirm this
    // client path is stable before re-enabling callers.
    httplib::Client cli("https://generativelanguage.googleapis.com");
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: client created" << std::endl;
    }

    cli.set_connection_timeout(5, 0);  // 5 seconds timeout

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: building endpoint" << std::endl;
    }
    // Model metadata endpoint; authenticated via the x-goog-api-key header.
    std::string test_endpoint = "/v1beta/models/" + config_.model;
    httplib::Headers headers = {
        {"x-goog-api-key", config_.api_key},
    };

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: making request to "
                << test_endpoint << std::endl;
    }
    auto res = cli.Get(test_endpoint.c_str(), headers);

    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: got response" << std::endl;
    }

    // Null result means the request never completed (DNS/connect failure).
    if (!res) {
      return absl::UnavailableError(
          "❌ Cannot reach Gemini API\n"
          " Check your internet connection");
    }

    // 401/403: bad or unauthorized key.
    if (res->status == 401 || res->status == 403) {
      return absl::PermissionDeniedError(
          "❌ Invalid Gemini API key\n"
          " Verify your key at: https://makersuite.google.com/app/apikey");
    }

    // 404: the configured model id does not exist.
    if (res->status == 404) {
      return absl::NotFoundError(
          absl::StrCat("❌ Model '", config_.model, "' not found\n",
                       " Try: gemini-2.5-flash or gemini-1.5-pro"));
    }

    // Any other non-200 is surfaced with the response body for debugging.
    if (res->status != 200) {
      return absl::InternalError(absl::StrCat(
          "❌ Gemini API error: ", res->status, "\n ", res->body));
    }

    return absl::OkStatus();
  } catch (const std::exception& e) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: EXCEPTION: " << e.what()
                << std::endl;
    }
    return absl::InternalError(
        absl::StrCat("Exception during availability check: ", e.what()));
  } catch (...) {
    if (config_.verbose) {
      std::cerr << "[DEBUG] CheckAvailability: UNKNOWN EXCEPTION" << std::endl;
    }
    return absl::InternalError("Unknown exception during availability check");
  }
#endif
}
388
389absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
390 const std::string& prompt) {
391 return GenerateResponse(
392 {{{agent::ChatMessage::Sender::kUser, prompt, absl::Now()}}});
393}
394
395absl::StatusOr<AgentResponse> GeminiAIService::GenerateResponse(
396 const std::vector<agent::ChatMessage>& history) {
397#ifndef YAZE_WITH_JSON
398 return absl::UnimplementedError(
399 "Gemini AI service requires JSON support. Build with "
400 "-DYAZE_WITH_JSON=ON");
401#else
402 if (history.empty()) {
403 return absl::InvalidArgumentError("History cannot be empty.");
404 }
405
406 // Build a structured conversation history for better context
407 // Gemini supports multi-turn conversations via the contents array
408 std::string prompt = prompt_builder_.BuildPromptFromHistory(history);
409
410 // Skip availability check - causes segfault with current SSL setup
411 // TODO: Fix SSL/TLS initialization issue
412 // if (auto status = CheckAvailability(); !status.ok()) {
413 // return status;
414 // }
415
416 if (config_.api_key.empty()) {
417 return absl::FailedPreconditionError("Gemini API key not configured");
418 }
419
420 absl::Time request_start = absl::Now();
421
422 try {
423 if (config_.verbose) {
424 std::cerr << "[DEBUG] Using curl for HTTPS request" << std::endl;
425 std::cerr << "[DEBUG] Processing " << history.size()
426 << " messages in history" << std::endl;
427 }
428
429 // Build conversation history for multi-turn context
430 // Gemini supports alternating user/model messages for better context
431 nlohmann::json contents = nlohmann::json::array();
432
433 // Add conversation history (up to last 10 messages for context window)
434 int start_idx = std::max(0, static_cast<int>(history.size()) - 10);
435 for (size_t i = start_idx; i < history.size(); ++i) {
436 const auto& msg = history[i];
437 std::string role =
438 (msg.sender == agent::ChatMessage::Sender::kUser) ? "user" : "model";
439
440 nlohmann::json message = {{"role", role},
441 {"parts", {{{"text", msg.message}}}}};
442 contents.push_back(message);
443 }
444
445 // If the last message is from the model, we need to ensure the conversation
446 // ends with a user message for Gemini
447 if (!history.empty() &&
448 history.back().sender == agent::ChatMessage::Sender::kAgent) {
449 // Add a continuation prompt
450 nlohmann::json user_continuation = {
451 {"role", "user"},
452 {"parts", {{{"text", "Please continue or clarify your response."}}}}};
453 contents.push_back(user_continuation);
454 }
455
456 // Build request with proper Gemini API v1beta format
457 nlohmann::json request_body = {
458 {"system_instruction",
459 {{"parts", {{"text", config_.system_instruction}}}}},
460 {"contents", contents},
461 {"generationConfig",
462 {{"temperature", config_.temperature},
463 {"maxOutputTokens", config_.max_output_tokens}}}};
464
465 if (config_.verbose) {
466 std::cerr << "[DEBUG] Sending " << contents.size()
467 << " conversation turns to Gemini" << std::endl;
468 }
469
470 // Only add responseMimeType if NOT using function calling
471 // (Gemini doesn't support both at the same time)
472 if (!function_calling_enabled_) {
473 request_body["generationConfig"]["responseMimeType"] = "application/json";
474 }
475
476 // Add function calling tools if enabled
477 if (function_calling_enabled_) {
478 auto tools_or = BuildGeminiToolPayload(prompt_builder_);
479 if (!tools_or.ok()) {
480 if (config_.verbose) {
481 std::cerr << "[DEBUG] Function calling schemas unavailable: "
482 << tools_or.status().message() << std::endl;
483 }
484 } else if (!tools_or->empty()) {
485 if (config_.verbose) {
486 std::string tools_str = tools_or->dump();
487 std::cerr << "[DEBUG] Function calling schemas: "
488 << tools_str.substr(0, 200) << "..." << std::endl;
489 }
490
491 request_body["tools"] = *tools_or;
492 }
493 }
494
495 std::string endpoint =
496 "https://generativelanguage.googleapis.com/v1beta/models/" +
497 config_.model + ":generateContent";
498 std::string response_str;
499#if defined(YAZE_AI_IOS_URLSESSION)
500 std::map<std::string, std::string> headers;
501 headers.emplace("Content-Type", "application/json");
502 headers.emplace("x-goog-api-key", config_.api_key);
503 auto resp_or = ios::UrlSessionHttpRequest("POST", endpoint, headers,
504 request_body.dump(), 60000);
505 if (!resp_or.ok()) {
506 return resp_or.status();
507 }
508 if (resp_or->status_code != 200) {
509 return absl::InternalError(absl::StrCat(
510 "Gemini API error: ", resp_or->status_code, "\n", resp_or->body));
511 }
512 response_str = resp_or->body;
513#else
514 // Write request body to temp file
515 std::string temp_file = "/tmp/gemini_request.json";
516 std::ofstream out(temp_file);
517 out << request_body.dump();
518 out.close();
519
520 // Use curl to make the request (avoiding httplib SSL issues)
521 std::string curl_cmd = "curl -s -X POST '" + endpoint +
522 "' "
523 "-H 'Content-Type: application/json' "
524 "-H 'x-goog-api-key: " +
525 config_.api_key +
526 "' "
527 "-d @" +
528 temp_file + " 2>&1";
529
530 if (config_.verbose) {
531 std::cerr << "[DEBUG] Executing API request..." << std::endl;
532 }
533
534#ifdef _WIN32
535 FILE* pipe = _popen(curl_cmd.c_str(), "r");
536#else
537 FILE* pipe = popen(curl_cmd.c_str(), "r");
538#endif
539 if (!pipe) {
540 return absl::InternalError("Failed to execute curl command");
541 }
542
543 char buffer[4096];
544 while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
545 response_str += buffer;
546 }
547
548#ifdef _WIN32
549 int status = _pclose(pipe);
550#else
551 int status = pclose(pipe);
552#endif
553 std::remove(temp_file.c_str());
554
555 if (status != 0) {
556 return absl::InternalError(
557 absl::StrCat("Curl failed with status ", status));
558 }
559#endif // YAZE_AI_IOS_URLSESSION
560
561 if (response_str.empty()) {
562 return absl::InternalError("Empty response from Gemini API");
563 }
564
565 // Debug: print response
566 if (config_.verbose) {
567 std::cout << "\n"
568 << "\033[35m"
569 << "🔍 Raw Gemini API Response:"
570 << "\033[0m"
571 << "\n"
572 << "\033[2m" << response_str.substr(0, 500) << "\033[0m"
573 << "\n\n";
574 }
575
576 if (config_.verbose) {
577 std::cerr << "[DEBUG] Parsing response..." << std::endl;
578 }
579 auto parsed_or = ParseGeminiResponse(response_str);
580 if (!parsed_or.ok()) {
581 return parsed_or.status();
582 }
583 AgentResponse agent_response = std::move(parsed_or.value());
584 agent_response.provider = kProviderGemini;
585 agent_response.model = config_.model;
586 agent_response.latency_seconds =
587 absl::ToDoubleSeconds(absl::Now() - request_start);
588 agent_response.parameters["prompt_version"] = config_.prompt_version;
589 agent_response.parameters["temperature"] =
590 absl::StrFormat("%.2f", config_.temperature);
591 agent_response.parameters["max_output_tokens"] =
592 absl::StrFormat("%d", config_.max_output_tokens);
593 agent_response.parameters["function_calling"] =
594 function_calling_enabled_ ? "true" : "false";
595 return agent_response;
596
597 } catch (const std::exception& e) {
598 if (config_.verbose) {
599 std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
600 }
601 return absl::InternalError(
602 absl::StrCat("Exception during generation: ", e.what()));
603 } catch (...) {
604 if (config_.verbose) {
605 std::cerr << "[ERROR] Unknown exception" << std::endl;
606 }
607 return absl::InternalError("Unknown exception during generation");
608 }
609#endif
610}
611
612absl::StatusOr<AgentResponse> GeminiAIService::ParseGeminiResponse(
613 const std::string& response_body) {
614#ifndef YAZE_WITH_JSON
615 return absl::UnimplementedError("JSON support required");
616#else
617 AgentResponse agent_response;
618
619 auto response_json = nlohmann::json::parse(response_body, nullptr, false);
620 if (response_json.is_discarded()) {
621 return absl::InternalError("❌ Failed to parse Gemini response JSON");
622 }
623
624 // Navigate Gemini's response structure
625 if (!response_json.contains("candidates") ||
626 response_json["candidates"].empty()) {
627 return absl::InternalError("❌ No candidates in Gemini response");
628 }
629
630 for (const auto& candidate : response_json["candidates"]) {
631 if (!candidate.contains("content") ||
632 !candidate["content"].contains("parts")) {
633 continue;
634 }
635
636 for (const auto& part : candidate["content"]["parts"]) {
637 if (part.contains("text")) {
638 std::string text_content = part["text"].get<std::string>();
639
640 // Debug: Print raw LLM output when verbose mode is enabled
641 if (config_.verbose) {
642 std::cout << "\n"
643 << "\033[35m"
644 << "🔍 Raw LLM Response:"
645 << "\033[0m"
646 << "\n"
647 << "\033[2m" << text_content << "\033[0m"
648 << "\n\n";
649 }
650
651 // Strip markdown code blocks if present (```json ... ```)
652 text_content = std::string(absl::StripAsciiWhitespace(text_content));
653 if (absl::StartsWith(text_content, "```json")) {
654 text_content = text_content.substr(7); // Remove ```json
655 } else if (absl::StartsWith(text_content, "```")) {
656 text_content = text_content.substr(3); // Remove ```
657 }
658 if (absl::EndsWith(text_content, "```")) {
659 text_content = text_content.substr(0, text_content.length() - 3);
660 }
661 text_content = std::string(absl::StripAsciiWhitespace(text_content));
662
663 // Try to parse as JSON object
664 auto parsed_text = nlohmann::json::parse(text_content, nullptr, false);
665 if (!parsed_text.is_discarded()) {
666 // Extract text_response
667 if (parsed_text.contains("text_response") &&
668 parsed_text["text_response"].is_string()) {
669 agent_response.text_response =
670 parsed_text["text_response"].get<std::string>();
671 }
672
673 // Extract reasoning
674 if (parsed_text.contains("reasoning") &&
675 parsed_text["reasoning"].is_string()) {
676 agent_response.reasoning =
677 parsed_text["reasoning"].get<std::string>();
678 }
679
680 // Extract commands
681 if (parsed_text.contains("commands") &&
682 parsed_text["commands"].is_array()) {
683 for (const auto& cmd : parsed_text["commands"]) {
684 if (cmd.is_string()) {
685 std::string command = cmd.get<std::string>();
686 if (absl::StartsWith(command, "z3ed ")) {
687 command = command.substr(5);
688 }
689 agent_response.commands.push_back(command);
690 }
691 }
692 }
693
694 // Extract tool_calls from the parsed JSON
695 if (parsed_text.contains("tool_calls") &&
696 parsed_text["tool_calls"].is_array()) {
697 for (const auto& call : parsed_text["tool_calls"]) {
698 if (call.contains("tool_name") && call["tool_name"].is_string()) {
699 ToolCall tool_call;
700 tool_call.tool_name = call["tool_name"].get<std::string>();
701
702 if (call.contains("args") && call["args"].is_object()) {
703 for (auto& [key, value] : call["args"].items()) {
704 if (value.is_string()) {
705 tool_call.args[key] = value.get<std::string>();
706 } else if (value.is_number()) {
707 tool_call.args[key] = std::to_string(value.get<double>());
708 } else if (value.is_boolean()) {
709 tool_call.args[key] =
710 value.get<bool>() ? "true" : "false";
711 }
712 }
713 }
714 agent_response.tool_calls.push_back(tool_call);
715 }
716 }
717 }
718 } else {
719 // If parsing the full object fails, fallback to extracting commands
720 // from text
721 std::vector<std::string> lines = absl::StrSplit(text_content, '\n');
722 for (const auto& line : lines) {
723 std::string trimmed = std::string(absl::StripAsciiWhitespace(line));
724 if (!trimmed.empty() && (absl::StartsWith(trimmed, "z3ed ") ||
725 absl::StartsWith(trimmed, "palette ") ||
726 absl::StartsWith(trimmed, "overworld ") ||
727 absl::StartsWith(trimmed, "sprite ") ||
728 absl::StartsWith(trimmed, "dungeon "))) {
729 if (absl::StartsWith(trimmed, "z3ed ")) {
730 trimmed = trimmed.substr(5);
731 }
732 agent_response.commands.push_back(trimmed);
733 }
734 }
735 }
736 } else if (part.contains("functionCall")) {
737 const auto& call = part["functionCall"];
738 if (call.contains("name") && call["name"].is_string()) {
739 ToolCall tool_call;
740 tool_call.tool_name = call["name"].get<std::string>();
741 if (call.contains("args") && call["args"].is_object()) {
742 for (auto& [key, value] : call["args"].items()) {
743 if (value.is_string()) {
744 tool_call.args[key] = value.get<std::string>();
745 } else if (value.is_number()) {
746 tool_call.args[key] = std::to_string(value.get<double>());
747 }
748 }
749 }
750 agent_response.tool_calls.push_back(tool_call);
751 }
752 }
753 }
754 }
755
756 if (agent_response.text_response.empty() && agent_response.commands.empty() &&
757 agent_response.tool_calls.empty()) {
758 return absl::InternalError(
759 "❌ No valid response extracted from Gemini\n"
760 " Expected at least one of: text_response, commands, or tool_calls\n"
761 " Raw response: " +
762 response_body);
763 }
764
765 return agent_response;
766#endif
767}
768
769absl::StatusOr<std::string> GeminiAIService::EncodeImageToBase64(
770 const std::string& image_path) const {
771#ifndef YAZE_WITH_JSON
772 (void)image_path; // Suppress unused parameter warning
773 return absl::UnimplementedError(
774 "Gemini AI service requires JSON support. Build with "
775 "-DYAZE_WITH_JSON=ON");
776#else
777 std::ifstream file(image_path, std::ios::binary);
778 if (!file.is_open()) {
779 return absl::NotFoundError(
780 absl::StrCat("Failed to open image file: ", image_path));
781 }
782
783 // Read file into buffer
784 file.seekg(0, std::ios::end);
785 size_t size = file.tellg();
786 file.seekg(0, std::ios::beg);
787
788 std::vector<unsigned char> buffer(size);
789 if (!file.read(reinterpret_cast<char*>(buffer.data()), size)) {
790 return absl::InternalError("Failed to read image file");
791 }
792
793 // Base64 encode
794 static const char* base64_chars =
795 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
796
797 std::string encoded;
798 encoded.reserve(((size + 2) / 3) * 4);
799
800 int i = 0;
801 int j = 0;
802 unsigned char char_array_3[3];
803 unsigned char char_array_4[4];
804
805 for (size_t idx = 0; idx < size; idx++) {
806 char_array_3[i++] = buffer[idx];
807 if (i == 3) {
808 char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
809 char_array_4[1] =
810 ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
811 char_array_4[2] =
812 ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
813 char_array_4[3] = char_array_3[2] & 0x3f;
814
815 for (i = 0; i < 4; i++)
816 encoded += base64_chars[char_array_4[i]];
817 i = 0;
818 }
819 }
820
821 if (i) {
822 for (j = i; j < 3; j++)
823 char_array_3[j] = '\0';
824
825 char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
826 char_array_4[1] =
827 ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
828 char_array_4[2] =
829 ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
830
831 for (j = 0; j < i + 1; j++)
832 encoded += base64_chars[char_array_4[j]];
833
834 while (i++ < 3)
835 encoded += '=';
836 }
837
838 return encoded;
839#endif
840}
841
842absl::StatusOr<AgentResponse> GeminiAIService::GenerateMultimodalResponse(
843 const std::string& image_path, const std::string& prompt) {
844#ifndef YAZE_WITH_JSON
845 (void)image_path; // Suppress unused parameter warnings
846 (void)prompt;
847 return absl::UnimplementedError(
848 "Gemini AI service requires JSON support. Build with "
849 "-DYAZE_WITH_JSON=ON");
850#else
851 if (config_.api_key.empty()) {
852 return absl::FailedPreconditionError("Gemini API key not configured");
853 }
854
855 // Determine MIME type from file extension
856 std::string mime_type = "image/png";
857 if (image_path.ends_with(".jpg") || image_path.ends_with(".jpeg")) {
858 mime_type = "image/jpeg";
859 } else if (image_path.ends_with(".bmp")) {
860 mime_type = "image/bmp";
861 } else if (image_path.ends_with(".webp")) {
862 mime_type = "image/webp";
863 }
864
865 // Encode image to base64
866 auto encoded_or = EncodeImageToBase64(image_path);
867 if (!encoded_or.ok()) {
868 return encoded_or.status();
869 }
870 std::string encoded_image = std::move(encoded_or.value());
871
872 try {
873 if (config_.verbose) {
874 std::cerr << "[DEBUG] Preparing multimodal request with image"
875 << std::endl;
876 }
877
878 // Build multimodal request with image and text
879 nlohmann::json request_body = {
880 {"contents",
881 {{{"parts",
882 {{{"inline_data",
883 {{"mime_type", mime_type}, {"data", encoded_image}}}},
884 {{"text", prompt}}}}}}},
885 {"generationConfig",
886 {{"temperature", config_.temperature},
887 {"maxOutputTokens", config_.max_output_tokens}}}};
888
889 std::string endpoint =
890 "https://generativelanguage.googleapis.com/v1beta/models/" +
891 config_.model + ":generateContent";
892 std::string response_str;
893#if defined(YAZE_AI_IOS_URLSESSION)
894 std::map<std::string, std::string> headers;
895 headers.emplace("Content-Type", "application/json");
896 headers.emplace("x-goog-api-key", config_.api_key);
897 auto resp_or = ios::UrlSessionHttpRequest("POST", endpoint, headers,
898 request_body.dump(), 60000);
899 if (!resp_or.ok()) {
900 return resp_or.status();
901 }
902 if (resp_or->status_code != 200) {
903 return absl::InternalError(absl::StrCat(
904 "Gemini API error: ", resp_or->status_code, "\n", resp_or->body));
905 }
906 response_str = resp_or->body;
907#else
908 // Write request body to temp file
909 std::string temp_file = "/tmp/gemini_multimodal_request.json";
910 std::ofstream out(temp_file);
911 out << request_body.dump();
912 out.close();
913
914 // Use curl to make the request
915 std::string curl_cmd = "curl -s -X POST '" + endpoint +
916 "' "
917 "-H 'Content-Type: application/json' "
918 "-H 'x-goog-api-key: " +
919 config_.api_key +
920 "' "
921 "-d @" +
922 temp_file + " 2>&1";
923
924 if (config_.verbose) {
925 std::cerr << "[DEBUG] Executing multimodal API request..." << std::endl;
926 }
927
928#ifdef _WIN32
929 FILE* pipe = _popen(curl_cmd.c_str(), "r");
930#else
931 FILE* pipe = popen(curl_cmd.c_str(), "r");
932#endif
933 if (!pipe) {
934 return absl::InternalError("Failed to execute curl command");
935 }
936
937 char buffer[4096];
938 while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
939 response_str += buffer;
940 }
941
942#ifdef _WIN32
943 int status = _pclose(pipe);
944#else
945 int status = pclose(pipe);
946#endif
947 std::remove(temp_file.c_str());
948
949 if (status != 0) {
950 return absl::InternalError(
951 absl::StrCat("Curl failed with status ", status));
952 }
953#endif // YAZE_AI_IOS_URLSESSION
954
955 if (response_str.empty()) {
956 return absl::InternalError("Empty response from Gemini API");
957 }
958
959 if (config_.verbose) {
960 std::cout << "\n"
961 << "\033[35m"
962 << "🔍 Raw Gemini Multimodal Response:"
963 << "\033[0m"
964 << "\n"
965 << "\033[2m" << response_str.substr(0, 500) << "\033[0m"
966 << "\n\n";
967 }
968
969 return ParseGeminiResponse(response_str);
970
971 } catch (const std::exception& e) {
972 if (config_.verbose) {
973 std::cerr << "[ERROR] Exception: " << e.what() << std::endl;
974 }
975 return absl::InternalError(
976 absl::StrCat("Exception during multimodal generation: ", e.what()));
977 }
978#endif
979}
980
981} // namespace cli
982} // namespace yaze
absl::StatusOr< AgentResponse > GenerateMultimodalResponse(const std::string &, const std::string &)
std::vector< std::string > GetAvailableTools() const
absl::StatusOr< std::vector< ModelInfo > > ListAvailableModels() override
GeminiAIService(const GeminiConfig &)
void SetRomContext(Rom *) override
absl::StatusOr< AgentResponse > GenerateResponse(const std::string &prompt) override
static nlohmann::json BuildGeminiTools(const nlohmann::json &function_declarations)
static absl::StatusOr< nlohmann::json > ResolveFunctionDeclarations(const PromptBuilder &prompt_builder)
static absl::StatusOr< std::filesystem::path > FindAsset(const std::string &relative_path)
Find an asset file in multiple standard locations.
absl::StatusOr< nlohmann::json > BuildGeminiToolPayload(const PromptBuilder &prompt_builder)
absl::StatusOr< UrlSessionHttpResponse > UrlSessionHttpRequest(const std::string &method, const std::string &url, const std::map< std::string, std::string > &headers, const std::string &body, int timeout_ms)
constexpr char kProviderGemini[]
Definition provider_ids.h:9