bkataru

ollama.zig

a single-file Zig 0.15.2 client for the Ollama REST API

no comments

here's a clean version

const std = @import("std");
const http = std.http;
const json = std.json;
const mem = std.mem;
const testing = std.testing;

pub const Ollama = struct {
    /// Allocator used for request bodies, response buffers, and the strings
    /// returned inside response structs.
    allocator: mem.Allocator,
    /// Standard-library HTTP client; owns connection state until `deinit`.
    client: http.Client,
    /// Server base URL (e.g. "http://localhost:11434"). Not owned by this
    /// struct; must stay valid for the client's lifetime.
    base_url: []const u8,

    /// Maximum response body size accepted from the server. A larger body
    /// makes the fixed response writer fail instead of overflowing.
    const max_response_bytes: usize = 10 * 1024 * 1024;

    pub const Error = error{
        RequestFailed,
        InvalidResponse,
        NetworkError,
        EndOfStream,
        ReadFailed,
        JsonParseError,
    } || mem.Allocator.Error || http.Client.RequestError || http.Client.FetchError || http.Client.ConnectError || json.ParseError(json.Scanner);

    /// Create a client. `base_url` defaults to the local Ollama daemon when
    /// null. The caller must call `deinit()` when done.
    pub fn init(allocator: mem.Allocator, base_url: ?[]const u8) !Ollama {
        return .{
            .allocator = allocator,
            .client = http.Client{ .allocator = allocator },
            .base_url = base_url orelse "http://localhost:11434",
        };
    }

    /// Close pooled connections and free client resources.
    /// Does NOT free `base_url`; the caller owns that slice.
    pub fn deinit(self: *Ollama) void {
        self.client.deinit();
    }

    /// One chat message; `role` is typically "system", "user" or "assistant".
    /// Slices are borrowed for the duration of the `chat()` call.
    pub const Message = struct {
        role: []const u8,
        content: []const u8,
    };

    /// Sampling parameters for /api/generate. Null fields are omitted from
    /// the request so the server's defaults apply.
    pub const GenerateOptions = struct {
        model: []const u8 = "granite4:tiny-h",
        temperature: ?f32 = null,
        top_p: ?f32 = null,
        // Integer token count — matches ChatOptions and the Ollama API
        // (was incorrectly ?f32).
        top_k: ?i32 = null,
        num_predict: ?i32 = null,
        stop: ?[]const []const u8 = null,
        seed: ?i32 = null,
        // Only non-streaming responses are parsed by this client.
        stream: bool = false,
    };

    /// Sampling parameters for /api/chat. Same semantics as GenerateOptions.
    pub const ChatOptions = struct {
        model: []const u8 = "granite4:tiny-h",
        temperature: ?f32 = null,
        top_p: ?f32 = null,
        top_k: ?i32 = null,
        num_predict: ?i32 = null,
        stop: ?[]const []const u8 = null,
        seed: ?i32 = null,
        stream: bool = false,
    };

    /// Result of `generate()`. Strings are heap copies owned by the response;
    /// free them with `deinit()`.
    pub const GenerateResponse = struct {
        response: []const u8,
        model: []const u8,
        done: bool,
        allocator: mem.Allocator,

        pub fn deinit(self: GenerateResponse) void {
            self.allocator.free(self.response);
            self.allocator.free(self.model);
        }
    };

    /// Result of `chat()`. Strings are heap copies owned by the response;
    /// free them with `deinit()`.
    pub const ChatResponse = struct {
        message: Message,
        done: bool,
        allocator: mem.Allocator,

        pub fn deinit(self: ChatResponse) void {
            self.allocator.free(self.message.role);
            self.allocator.free(self.message.content);
        }
    };

    /// One entry from /api/tags. Strings are heap copies; free via `deinit()`.
    pub const ModelInfo = struct {
        name: []const u8,
        modified_at: []const u8,
        /// Model size in bytes as reported by the server.
        size: i64,
        allocator: mem.Allocator,

        pub fn deinit(self: ModelInfo) void {
            self.allocator.free(self.name);
            self.allocator.free(self.modified_at);
        }
    };

    /// Result of `listModels()`. Frees every ModelInfo plus the array itself.
    pub const ListResponse = struct {
        models: []ModelInfo,
        allocator: mem.Allocator,

        pub fn deinit(self: ListResponse) void {
            for (self.models) |model| {
                model.deinit();
            }
            self.allocator.free(self.models);
        }
    };

    /// Generate a completion for `prompt` via /api/generate.
    /// Caller must call `deinit()` on the returned response.
    pub fn generate(
        self: *Ollama,
        prompt: []const u8,
        options: GenerateOptions,
    ) Error!GenerateResponse {
        var body = std.ArrayListUnmanaged(u8).empty;
        defer body.deinit(self.allocator);

        try body.appendSlice(self.allocator, "{\"model\":\"");
        try self.jsonEscape(&body, options.model);
        try body.appendSlice(self.allocator, "\",\"prompt\":\"");
        try self.jsonEscape(&body, prompt);
        try body.appendSlice(self.allocator, "\"");
        try self.appendModelOptions(&body, options);
        try body.writer(self.allocator).print(",\"stream\":{}", .{options.stream});
        try body.appendSlice(self.allocator, "}");

        const response_json = try self.fetchBody(.POST, "/api/generate", body.items);
        defer self.allocator.free(response_json);

        const parsed = try json.parseFromSlice(json.Value, self.allocator, response_json, .{});
        defer parsed.deinit();

        if (parsed.value != .object) return Error.InvalidResponse;
        const obj = parsed.value.object;

        // Validate everything before duplicating, so nothing leaks on a
        // malformed reply.
        const response_text = try getString(obj, "response");
        const model_name = try getString(obj, "model");
        const done = try getBool(obj, "done");

        const response_copy = try self.allocator.dupe(u8, response_text);
        errdefer self.allocator.free(response_copy);

        return GenerateResponse{
            .response = response_copy,
            .model = try self.allocator.dupe(u8, model_name),
            .done = done,
            .allocator = self.allocator,
        };
    }

    /// Send a conversation via /api/chat and return the assistant's reply.
    /// Caller must call `deinit()` on the returned response.
    pub fn chat(
        self: *Ollama,
        messages: []const Message,
        options: ChatOptions,
    ) Error!ChatResponse {
        var body = std.ArrayListUnmanaged(u8).empty;
        defer body.deinit(self.allocator);

        try body.appendSlice(self.allocator, "{\"model\":\"");
        try self.jsonEscape(&body, options.model);
        try body.appendSlice(self.allocator, "\",\"messages\":[");
        for (messages, 0..) |msg, i| {
            if (i > 0) try body.append(self.allocator, ',');
            try body.appendSlice(self.allocator, "{\"role\":\"");
            try self.jsonEscape(&body, msg.role);
            try body.appendSlice(self.allocator, "\",\"content\":\"");
            try self.jsonEscape(&body, msg.content);
            try body.appendSlice(self.allocator, "\"}");
        }
        try body.append(self.allocator, ']');
        try self.appendModelOptions(&body, options);
        try body.writer(self.allocator).print(",\"stream\":{}", .{options.stream});
        try body.append(self.allocator, '}');

        const response_json = try self.fetchBody(.POST, "/api/chat", body.items);
        defer self.allocator.free(response_json);

        const parsed = try json.parseFromSlice(json.Value, self.allocator, response_json, .{});
        defer parsed.deinit();

        if (parsed.value != .object) return Error.InvalidResponse;
        const obj = parsed.value.object;

        const message_value = obj.get("message") orelse return Error.InvalidResponse;
        if (message_value != .object) return Error.InvalidResponse;
        const msg_obj = message_value.object;

        const role_text = try getString(msg_obj, "role");
        const content_text = try getString(msg_obj, "content");
        const done = try getBool(obj, "done");

        const role = try self.allocator.dupe(u8, role_text);
        errdefer self.allocator.free(role);

        return ChatResponse{
            .message = Message{
                .role = role,
                .content = try self.allocator.dupe(u8, content_text),
            },
            .done = done,
            .allocator = self.allocator,
        };
    }

    /// List models installed on the server via /api/tags.
    /// Caller must call `deinit()` on the returned response.
    pub fn listModels(self: *Ollama) Error!ListResponse {
        const response_json = try self.fetchBody(.GET, "/api/tags", null);
        defer self.allocator.free(response_json);

        const parsed = try json.parseFromSlice(json.Value, self.allocator, response_json, .{});
        defer parsed.deinit();

        if (parsed.value != .object) return Error.InvalidResponse;
        const obj = parsed.value.object;

        const models_value = obj.get("models") orelse return Error.InvalidResponse;
        if (models_value != .array) return Error.InvalidResponse;
        const models_array = models_value.array;

        const models = try self.allocator.alloc(ModelInfo, models_array.items.len);
        // Track how many entries were fully built so a mid-loop failure
        // frees only what was actually allocated.
        var initialized: usize = 0;
        errdefer {
            for (models[0..initialized]) |model| model.deinit();
            self.allocator.free(models);
        }

        for (models_array.items, 0..) |model_val, i| {
            if (model_val != .object) return Error.InvalidResponse;
            const model_obj = model_val.object;

            const size = try getInteger(model_obj, "size");
            const name = try self.allocator.dupe(u8, try getString(model_obj, "name"));
            errdefer self.allocator.free(name);
            const modified_at = try self.allocator.dupe(u8, try getString(model_obj, "modified_at"));

            models[i] = ModelInfo{
                .name = name,
                .modified_at = modified_at,
                .size = size,
                .allocator = self.allocator,
            };
            initialized = i + 1;
        }

        return ListResponse{
            .models = models,
            .allocator = self.allocator,
        };
    }

    /// Perform one HTTP exchange with `base_url ++ path` and return the
    /// response body. `request_body == null` sends a bodiless request (GET).
    /// Returns Error.RequestFailed on a non-200 status. Caller owns the
    /// returned slice and frees it with `self.allocator`.
    fn fetchBody(
        self: *Ollama,
        method: http.Method,
        path: []const u8,
        request_body: ?[]const u8,
    ) Error![]u8 {
        const url = try std.fmt.allocPrint(self.allocator, "{s}{s}", .{ self.base_url, path });
        defer self.allocator.free(url);

        const uri = try std.Uri.parse(url);

        var req = try self.client.request(method, uri, .{
            .extra_headers = if (request_body != null)
                &[_]http.Header{.{ .name = "Content-Type", .value = "application/json" }}
            else
                &[_]http.Header{},
        });
        defer req.deinit();

        if (request_body) |payload| {
            req.transfer_encoding = .{ .content_length = payload.len };
            var body_writer = try req.sendBody(&.{});
            try body_writer.writer.writeAll(payload);
            try body_writer.end();
        } else {
            try req.sendBodiless();
        }

        var response = try req.receiveHead(&.{});

        // Heap buffer: a 10 MB stack array risks stack overflow.
        const response_buffer = try self.allocator.alloc(u8, max_response_bytes);
        defer self.allocator.free(response_buffer);
        var response_writer: std.Io.Writer = .fixed(response_buffer);
        var read_buffer: [4096]u8 = undefined;
        const body_reader: *std.Io.Reader = response.reader(&read_buffer);
        // streamRemaining loops until the body is exhausted (a single
        // stream() call may return early).
        const n = try body_reader.streamRemaining(&response_writer);

        if (response.head.status != .ok) {
            std.debug.print("{s} request failed with status {}: {s}\n", .{ path, response.head.status, response_buffer[0..n] });
            return Error.RequestFailed;
        }

        return try self.allocator.dupe(u8, response_buffer[0..n]);
    }

    /// Serialize the non-null sampling parameters shared by GenerateOptions
    /// and ChatOptions. The Ollama API expects them nested under an
    /// "options" object (top-level copies are ignored by the server), so
    /// this emits `,"options":{...}` — or nothing when every field is null.
    fn appendModelOptions(
        self: *Ollama,
        buffer: *std.ArrayListUnmanaged(u8),
        options: anytype,
    ) Error!void {
        var opts = std.ArrayListUnmanaged(u8).empty;
        defer opts.deinit(self.allocator);

        if (options.temperature) |temperature| {
            try opts.writer(self.allocator).print(",\"temperature\":{d}", .{temperature});
        }
        if (options.top_p) |top_p| {
            try opts.writer(self.allocator).print(",\"top_p\":{d}", .{top_p});
        }
        if (options.top_k) |top_k| {
            try opts.writer(self.allocator).print(",\"top_k\":{d}", .{top_k});
        }
        if (options.num_predict) |num_predict| {
            try opts.writer(self.allocator).print(",\"num_predict\":{d}", .{num_predict});
        }
        if (options.seed) |seed| {
            try opts.writer(self.allocator).print(",\"seed\":{d}", .{seed});
        }
        if (options.stop) |stop| {
            // `stop` was declared but never serialized before.
            try opts.appendSlice(self.allocator, ",\"stop\":[");
            for (stop, 0..) |sequence, i| {
                if (i > 0) try opts.append(self.allocator, ',');
                try opts.append(self.allocator, '"');
                try self.jsonEscape(&opts, sequence);
                try opts.append(self.allocator, '"');
            }
            try opts.append(self.allocator, ']');
        }

        if (opts.items.len > 0) {
            try buffer.appendSlice(self.allocator, ",\"options\":{");
            // Skip the leading comma produced by the first field.
            try buffer.appendSlice(self.allocator, opts.items[1..]);
            try buffer.append(self.allocator, '}');
        }
    }

    /// Append `text` to `buffer` with JSON string escaping. Control
    /// characters below 0x20 must be escaped per the JSON grammar, so any
    /// without a short form are emitted as \u00XX.
    fn jsonEscape(self: *Ollama, buffer: *std.ArrayListUnmanaged(u8), text: []const u8) !void {
        for (text) |c| {
            switch (c) {
                '"' => try buffer.appendSlice(self.allocator, "\\\""),
                '\\' => try buffer.appendSlice(self.allocator, "\\\\"),
                '\n' => try buffer.appendSlice(self.allocator, "\\n"),
                '\r' => try buffer.appendSlice(self.allocator, "\\r"),
                '\t' => try buffer.appendSlice(self.allocator, "\\t"),
                else => {
                    if (c < 0x20) {
                        try buffer.writer(self.allocator).print("\\u{x:0>4}", .{c});
                    } else {
                        try buffer.append(self.allocator, c);
                    }
                },
            }
        }
    }

    /// Fetch a required string field or fail with InvalidResponse
    /// (the `.?.string` pattern it replaces panicked on bad replies).
    fn getString(obj: json.ObjectMap, key: []const u8) Error![]const u8 {
        const value = obj.get(key) orelse return Error.InvalidResponse;
        return switch (value) {
            .string => |s| s,
            else => Error.InvalidResponse,
        };
    }

    /// Fetch a required boolean field or fail with InvalidResponse.
    fn getBool(obj: json.ObjectMap, key: []const u8) Error!bool {
        const value = obj.get(key) orelse return Error.InvalidResponse;
        return switch (value) {
            .bool => |b| b,
            else => Error.InvalidResponse,
        };
    }

    /// Fetch a required integer field or fail with InvalidResponse.
    fn getInteger(obj: json.ObjectMap, key: []const u8) Error!i64 {
        const value = obj.get(key) orelse return Error.InvalidResponse;
        return switch (value) {
            .integer => |n| n,
            else => Error.InvalidResponse,
        };
    }
};

test "OllamaClient - init and deinit" {
    // testing.allocator is the shared test allocator instance.
    var ollama = try Ollama.init(testing.allocator, null);
    defer ollama.deinit();

    try testing.expect(ollama.base_url.len > 0);
    try testing.expectEqualStrings("http://localhost:11434", ollama.base_url);
}

test "Ollama - init with custom URL" {
    const custom_url = "http://192.168.1.100:11434";

    var ollama = try Ollama.init(testing.allocator, custom_url);
    defer ollama.deinit();

    // The client must keep exactly the URL it was given.
    try testing.expectEqualStrings(custom_url, ollama.base_url);
}

test "Ollama - Message structure" {
    const message: Ollama.Message = .{
        .role = "user",
        .content = "Hello, Ollama!",
    };

    try testing.expectEqualStrings("user", message.role);
    try testing.expectEqualStrings("Hello, Ollama!", message.content);
}

test "Ollama - GenerateOptions defaults" {
    const defaults: Ollama.GenerateOptions = .{};

    // Every sampling field is null by default; only model/stream are set.
    try testing.expectEqualStrings("granite4:tiny-h", defaults.model);
    try testing.expect(defaults.temperature == null);
    try testing.expect(defaults.seed == null);
    try testing.expect(!defaults.stream);
}

test "Ollama - GenerateOptions custom" {
    const custom: Ollama.GenerateOptions = .{
        .model = "granite4:tiny-h",
        .temperature = 0.7,
        .seed = 123,
        .stream = false,
        .num_predict = 100,
    };

    try testing.expectEqualStrings("granite4:tiny-h", custom.model);
    try testing.expectEqual(@as(f32, 0.7), custom.temperature.?);
    try testing.expectEqual(@as(i32, 123), custom.seed.?);
    try testing.expect(!custom.stream);
    try testing.expectEqual(@as(i32, 100), custom.num_predict.?);
}

test "Ollama - ChatOptions defaults" {
    const defaults: Ollama.ChatOptions = .{};

    try testing.expectEqualStrings("granite4:tiny-h", defaults.model);
    try testing.expect(defaults.temperature == null);
    try testing.expect(defaults.seed == null);
    try testing.expect(!defaults.stream);
}

test "Ollama - listModels integration" {
    // Integration test: requires a running Ollama daemon on localhost.
    var client = try Ollama.init(testing.allocator, null);
    defer client.deinit();

    const list_response: Ollama.ListResponse = try client.listModels();
    defer list_response.deinit();

    std.debug.print("\nAvailable models: {d}\n", .{list_response.models.len});
    for (list_response.models) |model| {
        std.debug.print("  - {s} (size: {d})\n", .{ model.name, model.size });
        // Replaces the old `models.len >= 0` check, which is a tautology on
        // usize: assert something the server actually guarantees.
        try testing.expect(model.name.len > 0);
        try testing.expect(model.size >= 0);
    }
}

test "Ollama - generate integration" {
    // Integration test: requires a running Ollama daemon on localhost.
    var ollama = try Ollama.init(testing.allocator, null);
    defer ollama.deinit();

    const reply = try ollama.generate("Say hello in one word", .{
        .model = "granite4:tiny-h",
    });
    defer reply.deinit();

    std.debug.print("\nGenerate Response: {s}\n", .{reply.response});
    std.debug.print("Model: {s}, Done: {}\n", .{ reply.model, reply.done });

    try testing.expect(reply.response.len > 0);
    try testing.expect(reply.done);
}

test "Ollama - chat integration" {
    // Integration test: requires a running Ollama daemon on localhost.
    var ollama = try Ollama.init(testing.allocator, null);
    defer ollama.deinit();

    const conversation = [_]Ollama.Message{
        .{ .role = "user", .content = "What is the capital of France?" },
    };

    const reply = try ollama.chat(&conversation, .{
        .model = "granite4:tiny-h",
    });
    defer reply.deinit();

    std.debug.print("\nChat Response: {s}\n", .{reply.message.content});
    std.debug.print("Role: {s}, Done: {}\n", .{ reply.message.role, reply.done });

    try testing.expect(reply.message.content.len > 0);
    try testing.expectEqualStrings("assistant", reply.message.role);
    try testing.expect(reply.done);
}

the commented version

NOTE: COMPOSER 1 WENT HAYWIRE WITH THE COMMENTS, YIKES.

//! Ollama API Client for Zig
//!
//! This module provides a complete HTTP client implementation for interacting with the Ollama API,
//! a local LLM inference server. The client handles HTTP communication, JSON serialization/deserialization,
//! and memory management using Zig's explicit memory allocation patterns.
//!
//! # Architecture Overview
//!
//! The client is built around Zig's `std.http.Client` from the standard library, which provides
//! a modern, async-capable HTTP client implementation. This wrapper adds:
//! - Structured API endpoints for Ollama's REST API
//! - JSON request/response handling
//! - Memory-safe string handling with explicit allocators
//! - Error handling with comprehensive error unions
//!
//! # Memory Management
//!
//! All memory allocations are explicit and managed through the `allocator` field. This follows Zig's
//! philosophy of explicit memory management. Callers must:
//! 1. Provide an allocator when initializing the client
//! 2. Call `deinit()` on the client when done
//! 3. Call `deinit()` on response structs to free their allocated memory
//!
//! # HTTP Request Flow
//!
//! For POST requests (chat, generate):
//! 1. Build JSON request body incrementally using `ArrayListUnmanaged`
//! 2. Parse URI from base URL + endpoint
//! 3. Create HTTP request with appropriate headers
//! 4. Set transfer encoding (content-length for POST requests)
//! 5. Send body using BodyWriter API (write data, then call `end()`)
//! 6. Receive response headers
//! 7. Read response body into buffer
//! 8. Parse JSON response
//! 9. Extract and duplicate strings (they're owned by the response buffer)
//!
//! For GET requests (listModels):
//! 1. Parse URI
//! 2. Create HTTP request
//! 3. Call `sendBodiless()` since no body is needed
//! 4. Receive response headers
//! 5. Read and parse response body
//!
//! # Zig-Specific Patterns Used
//!
//! - **Error Unions**: Functions return `Error!ReturnType` where Error is a union of possible errors
//! - **Optional Types**: `?Type` syntax for nullable values
//! - **Defer**: Automatic cleanup using `defer` statements
//! - **ArrayListUnmanaged**: Memory-efficient dynamic arrays where caller manages allocation
//! - **Slice Syntax**: `[]const u8` for string slices (no null termination needed)
//! - **Struct Initialization**: `.{ ... }` syntax for anonymous struct literals
//! - **Error Propagation**: `try` keyword for propagating errors up the call stack
//!
//! # JSON Handling
//!
//! JSON is manually constructed for requests (avoiding external dependencies) and parsed using
//! `std.json` for responses. Manual construction allows precise control over escaping and formatting.
//! The `jsonEscape` helper function ensures proper escaping of special characters in strings.

const std = @import("std");
const http = std.http;
const json = std.json;
const mem = std.mem;
const testing = std.testing;

/// Ollama API client for local LLM inference
///
/// This struct encapsulates all state needed to communicate with an Ollama server.
/// It maintains an HTTP client instance and base URL configuration.
///
/// # Example Usage
///
/// ```zig
/// var gpa = std.heap.GeneralPurposeAllocator(.{}){};
/// defer _ = gpa.deinit();
/// var client = try Ollama.init(gpa.allocator(), null);
/// defer client.deinit();
///
/// const messages = [_]Ollama.Message{
///     .{ .role = "user", .content = "Hello!" },
/// };
/// const response = try client.chat(&messages, .{});
/// defer response.deinit();
/// ```
pub const Ollama = struct {
    /// Memory allocator used for all dynamic allocations.
    /// This is passed to all std library functions that need allocation.
    allocator: mem.Allocator,

    /// HTTP client instance that handles network communication.
    /// This is Zig's standard library HTTP client, which manages connections,
    /// connection pooling, and HTTP protocol details.
    client: http.Client,

    /// Base URL of the Ollama server (e.g., "http://localhost:11434").
    /// This is stored as a slice, so it must remain valid for the lifetime of the client.
    /// If provided as a string literal in `init()`, it will be valid for the program's lifetime.
    base_url: []const u8,

    /// Error union type that represents all possible errors this client can return.
    ///
    /// This is a union of:
    /// - Custom errors: RequestFailed, InvalidResponse, NetworkError, etc.
    /// - Standard library errors: Allocator errors, HTTP client errors, JSON parsing errors
    ///
    /// Using error unions allows Zig to statically check all error cases are handled.
    pub const Error = error{
        /// The HTTP request succeeded but returned a non-OK status code
        RequestFailed,
        /// The response body could not be parsed or was malformed
        InvalidResponse,
        /// A network-level error occurred (connection failed, timeout, etc.)
        NetworkError,
        /// Unexpected end of stream while reading response
        EndOfStream,
        /// Failed to read from the response stream
        ReadFailed,
        /// JSON parsing failed (malformed JSON, type mismatch, etc.)
        JsonParseError,
    } || mem.Allocator.Error || http.Client.RequestError || http.Client.FetchError || http.Client.ConnectError || json.ParseError(json.Scanner);

    /// Initialize a new Ollama client instance.
    ///
    /// Creates a new HTTP client and configures it to communicate with an Ollama server.
    /// The client will use connection pooling for efficiency.
    ///
    /// # Parameters
    /// - `allocator`: Memory allocator to use for all allocations. Must remain valid for the
    ///   lifetime of the client. Common choices: `std.heap.page_allocator`, `std.heap.GeneralPurposeAllocator`.
    /// - `base_url`: Optional base URL of the Ollama server. If `null`, defaults to `"http://localhost:11434"`.
    ///   Must remain valid for the lifetime of the client.
    ///
    /// # Returns
    /// A new `Ollama` instance, or an error if initialization fails.
    ///
    /// # Memory Safety
    /// The `base_url` slice must remain valid for the lifetime of the client. If you pass a
    /// string literal, it will be valid for the program's lifetime. If you pass an allocated
    /// string, you're responsible for freeing it after `deinit()` is called.
    pub fn init(allocator: mem.Allocator, base_url: ?[]const u8) !Ollama {
        return .{
            .allocator = allocator,
            // Initialize HTTP client with the same allocator.
            // The client uses this allocator for connection pooling and internal buffers.
            .client = http.Client{ .allocator = allocator },
            // Use the provided base_url or default to localhost.
            // The `orelse` operator provides a default value for optional types.
            .base_url = base_url orelse "http://localhost:11434",
        };
    }

    /// Clean up resources associated with the client.
    ///
    /// This should be called when the client is no longer needed. It will:
    /// - Close all HTTP connections in the connection pool
    /// - Free any resources held by the HTTP client
    ///
    /// # Safety
    /// After calling this, the client must not be used. Note that this does NOT free
    /// the `base_url` slice if it was allocated by the caller.
    pub fn deinit(self: *Ollama) void {
        // Deinitialize the HTTP client, closing connections and freeing resources.
        // This does NOT free self.base_url - the caller owns that.
        self.client.deinit();
    }

    /// Message structure representing a single message in a chat conversation.
    ///
    /// Used in the `chat()` function to provide conversation history and context.
    /// Each message has a role (e.g., "user", "assistant", "system") and content.
    ///
    /// # Memory Safety
    /// The `content` and `role` slices must remain valid for the duration of the `chat()` call.
    /// They are not copied by the client - they're directly used to build the JSON request.
    pub const Message = struct {
        /// The message content/text
        content: []const u8,
        /// The role of the message sender (typically "user", "assistant", or "system")
        role: []const u8,
    };

    /// Configuration options for text generation requests.
    ///
    /// These options control how the model generates text. All fields except `model` and `stream`
    /// are optional (nullable). The client only includes non-null fields in the JSON request.
    pub const GenerateOptions = struct {
        /// Name of the model to use for generation
        model: []const u8 = "granite4:tiny-h",
        /// Sampling temperature (0.0 to 1.0). Higher values make output more random.
        /// Typical values: 0.7-0.9 for creative tasks, 0.1-0.3 for focused tasks.
        temperature: ?f32 = null,
        /// Nucleus sampling parameter. Controls diversity via nucleus sampling.
        /// Typical values: 0.9-0.95. Only considers tokens with top_p probability mass.
        top_p: ?f32 = null,
        /// Limits sampling to top K most likely tokens. Reduces randomness.
        /// Typical values: 10-50. Set to 1 for greedy decoding (most likely token).
        top_k: ?i32 = null,
        /// Maximum number of tokens to generate. Limits response length.
        num_predict: ?i32 = null,
        /// Array of stop sequences. Generation stops when any sequence is encountered.
        /// Common examples: ["\n\n", "Human:", "Assistant:"]
        stop: ?[]const []const u8 = null,
        /// Random seed for reproducible outputs. If set, same prompt + seed = same output.
        seed: ?i32 = null,
        /// If true, returns a stream of response chunks. If false, returns complete response.
        /// Note: This implementation only supports non-streaming mode (stream=false).
        stream: bool = false,
    };

    /// Configuration options for chat requests.
    ///
    /// Similar to `GenerateOptions` but used for chat-style conversations with message history.
    /// See `GenerateOptions` documentation for parameter descriptions.
    pub const ChatOptions = struct {
        /// Name of the model to use for chat
        model: []const u8 = "granite4:tiny-h",
        /// Sampling temperature (0.0 to 1.0)
        temperature: ?f32 = null,
        /// Nucleus sampling parameter
        top_p: ?f32 = null,
        /// Top-K sampling parameter
        top_k: ?i32 = null,
        /// Maximum number of tokens to generate
        num_predict: ?i32 = null,
        /// Array of stop sequences
        stop: ?[]const []const u8 = null,
        /// Random seed for reproducibility
        seed: ?i32 = null,
        /// Whether to stream responses (currently only false is supported)
        stream: bool = false,
    };

    /// Response structure returned from text generation requests.
    ///
    /// Contains the generated text, model name, and completion status.
    /// All string fields are heap-allocated and must be freed using `deinit()`.
    pub const GenerateResponse = struct {
        /// The generated text response from the model
        response: []const u8,
        /// Name of the model that generated the response
        model: []const u8,
        /// Whether generation is complete (always true for non-streaming mode)
        done: bool,
        /// Allocator used for this response's memory (needed for deinit)
        allocator: mem.Allocator,

        /// Free all heap-allocated memory in this response.
        ///
        /// This must be called when the response is no longer needed to prevent memory leaks.
        /// After calling this, the response struct should not be used.
        pub fn deinit(self: GenerateResponse) void {
            self.allocator.free(self.response);
            self.allocator.free(self.model);
        }
    };

    /// Response structure returned from chat requests.
    ///
    /// Contains the assistant's message, role, and completion status.
    /// All string fields are heap-allocated and must be freed using `deinit()`.
    pub const ChatResponse = struct {
        /// The assistant's message (contains role and content)
        message: Message,
        /// Whether the chat response is complete (always true for non-streaming mode)
        done: bool,
        /// Allocator used for this response's memory (needed for deinit)
        allocator: mem.Allocator,

        /// Free all heap-allocated memory in this response.
        ///
        /// This must be called when the response is no longer needed to prevent memory leaks.
        pub fn deinit(self: ChatResponse) void {
            self.allocator.free(self.message.role);
            self.allocator.free(self.message.content);
        }
    };

    /// Information about a single Ollama model.
    ///
    /// Returned by `listModels()` to describe available models.
    /// All string fields are heap-allocated and must be freed using `deinit()`.
    pub const ModelInfo = struct {
        /// Model name/identifier (e.g., "granite4:tiny-h")
        name: []const u8,
        /// ISO 8601 timestamp of when the model was last modified
        modified_at: []const u8,
        /// Size of the model in bytes
        size: i64,
        /// Allocator used for this model info's memory (needed for deinit)
        allocator: mem.Allocator,

        /// Free all heap-allocated memory in this model info.
        ///
        /// This must be called when the model info is no longer needed to prevent memory leaks.
        pub fn deinit(self: ModelInfo) void {
            self.allocator.free(self.name);
            self.allocator.free(self.modified_at);
        }
    };

    /// Result of a `listModels()` call: every model the server reported.
    ///
    /// Owns both the `models` array and the strings inside each entry;
    /// release everything with `deinit()`.
    pub const ListResponse = struct {
        /// Heap-allocated array of per-model metadata
        models: []ModelInfo,
        /// Allocator that owns `models`; used by `deinit()`
        allocator: mem.Allocator,

        /// Release each entry's strings, then the array itself.
        ///
        /// Call exactly once when the response is no longer needed to avoid leaks.
        pub fn deinit(self: ListResponse) void {
            // Entries own their strings; free those before dropping the array.
            for (self.models) |model| model.deinit();
            self.allocator.free(self.models);
        }
    };

    /// Generate a text completion for a single prompt (non-conversational).
    ///
    /// Sends a POST to `{base_url}/api/generate` and returns the model's
    /// completion. For multi-turn conversations, use `chat()` instead.
    ///
    /// Parameters:
    /// - `prompt`: text prompt; JSON-escaped automatically.
    /// - `options`: model name plus optional sampler settings.
    ///
    /// Returns a `GenerateResponse` whose strings are heap-allocated copies;
    /// the caller must release it with `deinit()`.
    ///
    /// Errors: network/HTTP failures, non-200 status (`Error.RequestFailed`),
    /// a malformed response body (`Error.InvalidResponse`), JSON parse errors,
    /// or allocation failure.
    pub fn generate(
        self: *Ollama,
        prompt: []const u8,
        options: GenerateOptions,
    ) Error!GenerateResponse {
        // Build "{base_url}/api/generate".
        var url_buffer = std.ArrayListUnmanaged(u8){};
        defer url_buffer.deinit(self.allocator);
        try url_buffer.writer(self.allocator).print("{s}/api/generate", .{self.base_url});

        // Build the JSON request body by hand (no intermediate value tree).
        var request_body_buffer = std.ArrayListUnmanaged(u8){};
        defer request_body_buffer.deinit(self.allocator);

        try request_body_buffer.appendSlice(self.allocator, "{\"model\":\"");
        // Escape the model name too: an unescaped quote would corrupt the JSON.
        try self.jsonEscape(&request_body_buffer, options.model);
        try request_body_buffer.appendSlice(self.allocator, "\",\"prompt\":\"");
        try self.jsonEscape(&request_body_buffer, prompt);
        try request_body_buffer.appendSlice(self.allocator, "\"");

        // Sampler parameters belong in a nested "options" object; the Ollama
        // REST API ignores them at the top level of the request.
        // Each entry is written with a leading comma; the comma of the first
        // entry is stripped when the object is spliced in below.
        var opt_buffer = std.ArrayListUnmanaged(u8){};
        defer opt_buffer.deinit(self.allocator);
        if (options.temperature) |temp| {
            try opt_buffer.writer(self.allocator).print(",\"temperature\":{d}", .{temp});
        }
        if (options.top_p) |top_p| {
            try opt_buffer.writer(self.allocator).print(",\"top_p\":{d}", .{top_p});
        }
        if (options.top_k) |top_k| {
            try opt_buffer.writer(self.allocator).print(",\"top_k\":{d}", .{top_k});
        }
        if (options.num_predict) |num| {
            try opt_buffer.writer(self.allocator).print(",\"num_predict\":{d}", .{num});
        }
        if (options.seed) |seed| {
            try opt_buffer.writer(self.allocator).print(",\"seed\":{d}", .{seed});
        }
        if (options.stop) |stops| {
            // Previously the `stop` option was silently ignored.
            try opt_buffer.appendSlice(self.allocator, ",\"stop\":[");
            for (stops, 0..) |stop, i| {
                if (i > 0) try opt_buffer.appendSlice(self.allocator, ",");
                try opt_buffer.appendSlice(self.allocator, "\"");
                try self.jsonEscape(&opt_buffer, stop);
                try opt_buffer.appendSlice(self.allocator, "\"");
            }
            try opt_buffer.appendSlice(self.allocator, "]");
        }
        if (opt_buffer.items.len > 0) {
            try request_body_buffer.appendSlice(self.allocator, ",\"options\":{");
            // items[1..] drops the leading comma written by the first entry.
            try request_body_buffer.appendSlice(self.allocator, opt_buffer.items[1..]);
            try request_body_buffer.appendSlice(self.allocator, "}");
        }

        // `stream` is a top-level request field (not a sampler option).
        try request_body_buffer.writer(self.allocator).print(",\"stream\":{}", .{options.stream});
        try request_body_buffer.appendSlice(self.allocator, "}");

        const request_body = request_body_buffer.items;
        const uri = try std.Uri.parse(url_buffer.items);

        var req = try self.client.request(.POST, uri, .{
            .extra_headers = &.{
                .{ .name = "Content-Type", .value = "application/json" },
            },
        });
        defer req.deinit();

        // Content-Length framing is required for POST bodies in Zig 0.15.2;
        // the client derives the Content-Length header from this field.
        req.transfer_encoding = .{ .content_length = request_body.len };

        std.debug.print("Generate: Sending body of length: {}\n", .{request_body.len});

        // BodyWriter API (same pattern as chat()): write the body, then end()
        // to finalize the request. The sendBody(request_body)/finish() calls
        // used previously do not exist in Zig 0.15.2.
        var body_writer = try req.sendBody(&.{});
        try body_writer.writer.writeAll(request_body);
        try body_writer.end();

        var response = try req.receiveHead(&.{});

        // Ollama returns 200 OK on success.
        if (response.head.status != .ok) {
            std.debug.print("Generate request failed with status: {}\n", .{response.head.status});
            return Error.RequestFailed;
        }

        // Heap-allocate the 10 MiB response buffer; a buffer this size on the
        // stack (as before) risked stack overflow. Matches chat()/listModels().
        const response_buffer = try self.allocator.alloc(u8, 10 * 1024 * 1024);
        defer self.allocator.free(response_buffer);
        var response_writer: std.Io.Writer = .fixed(response_buffer);
        var read_buffer: [4096]u8 = undefined;
        const body_reader: *std.Io.Reader = response.reader(&read_buffer);

        // Drain the whole body into the fixed buffer.
        const n = try body_reader.streamRemaining(&response_writer);
        const response_json = response_buffer[0..n];
        std.debug.print("Generate response body ({} bytes): {s}\n", .{ n, response_json });

        const parsed = try json.parseFromSlice(json.Value, self.allocator, response_json, .{});
        defer parsed.deinit();

        // Response shape: {"response": "...", "model": "...", "done": true, ...}
        // Missing keys surface as InvalidResponse instead of a panic on `.?`.
        const obj = parsed.value.object;
        const response_text = (obj.get("response") orelse return Error.InvalidResponse).string;
        const model_name = (obj.get("model") orelse return Error.InvalidResponse).string;
        // std.json.Value's boolean payload is `.bool` (`.boolean` does not exist).
        const done = (obj.get("done") orelse return Error.InvalidResponse).bool;

        // The parsed strings point into memory freed by `parsed.deinit()`;
        // duplicate them so the caller owns stable copies.
        const response_copy = try self.allocator.dupe(u8, response_text);
        errdefer self.allocator.free(response_copy);
        return GenerateResponse{
            .response = response_copy,
            .model = try self.allocator.dupe(u8, model_name),
            .done = done,
            .allocator = self.allocator,
        };
    }

    /// Chat with the model using conversation history.
    ///
    /// Sends a POST to `{base_url}/api/chat` with the full message history and
    /// returns the assistant's reply. Unlike `generate()`, this supports
    /// multi-turn conversations with context.
    ///
    /// Parameters:
    /// - `messages`: conversation history, processed in order; roles and
    ///   contents are JSON-escaped automatically.
    /// - `options`: model name plus optional sampler settings.
    ///
    /// Returns a `ChatResponse` whose strings are heap-allocated copies; the
    /// caller must release it with `deinit()`.
    ///
    /// Errors: network/HTTP failures, non-200 status (`Error.RequestFailed`),
    /// a malformed response body (`Error.InvalidResponse`), JSON parse errors,
    /// or allocation failure.
    pub fn chat(
        self: *Ollama,
        messages: []const Message,
        options: ChatOptions,
    ) Error!ChatResponse {
        // Build "{base_url}/api/chat".
        var url_buffer = std.ArrayListUnmanaged(u8){};
        defer url_buffer.deinit(self.allocator);
        try url_buffer.writer(self.allocator).print("{s}/api/chat", .{self.base_url});

        // Build the JSON request body by hand (no intermediate value tree).
        var request_body_buffer = std.ArrayListUnmanaged(u8){};
        defer request_body_buffer.deinit(self.allocator);

        try request_body_buffer.appendSlice(self.allocator, "{\"model\":\"");
        // Escape the model name too: an unescaped quote would corrupt the JSON.
        try self.jsonEscape(&request_body_buffer, options.model);
        try request_body_buffer.appendSlice(self.allocator, "\",\"messages\":[");

        for (messages, 0..) |msg, i| {
            // Comma separator between message objects (not before the first).
            if (i > 0) try request_body_buffer.appendSlice(self.allocator, ",");
            try request_body_buffer.appendSlice(self.allocator, "{\"role\":\"");
            // Roles were previously embedded unescaped; a quote or backslash
            // in a role would have produced invalid JSON.
            try self.jsonEscape(&request_body_buffer, msg.role);
            try request_body_buffer.appendSlice(self.allocator, "\",\"content\":\"");
            try self.jsonEscape(&request_body_buffer, msg.content);
            try request_body_buffer.appendSlice(self.allocator, "\"}");
        }
        try request_body_buffer.appendSlice(self.allocator, "]");

        // Sampler parameters belong in a nested "options" object; the Ollama
        // REST API ignores them at the top level of the request.
        // Each entry is written with a leading comma; the comma of the first
        // entry is stripped when the object is spliced in below.
        var opt_buffer = std.ArrayListUnmanaged(u8){};
        defer opt_buffer.deinit(self.allocator);
        if (options.temperature) |temp| {
            try opt_buffer.writer(self.allocator).print(",\"temperature\":{d}", .{temp});
        }
        if (options.top_p) |top_p| {
            try opt_buffer.writer(self.allocator).print(",\"top_p\":{d}", .{top_p});
        }
        if (options.top_k) |top_k| {
            try opt_buffer.writer(self.allocator).print(",\"top_k\":{d}", .{top_k});
        }
        if (options.num_predict) |num| {
            try opt_buffer.writer(self.allocator).print(",\"num_predict\":{d}", .{num});
        }
        if (options.seed) |seed| {
            try opt_buffer.writer(self.allocator).print(",\"seed\":{d}", .{seed});
        }
        if (options.stop) |stops| {
            // Previously the `stop` option was silently ignored.
            try opt_buffer.appendSlice(self.allocator, ",\"stop\":[");
            for (stops, 0..) |stop, i| {
                if (i > 0) try opt_buffer.appendSlice(self.allocator, ",");
                try opt_buffer.appendSlice(self.allocator, "\"");
                try self.jsonEscape(&opt_buffer, stop);
                try opt_buffer.appendSlice(self.allocator, "\"");
            }
            try opt_buffer.appendSlice(self.allocator, "]");
        }
        if (opt_buffer.items.len > 0) {
            try request_body_buffer.appendSlice(self.allocator, ",\"options\":{");
            // items[1..] drops the leading comma written by the first entry.
            try request_body_buffer.appendSlice(self.allocator, opt_buffer.items[1..]);
            try request_body_buffer.appendSlice(self.allocator, "}");
        }

        // `stream` is a top-level request field (not a sampler option).
        try request_body_buffer.writer(self.allocator).print(",\"stream\":{}", .{options.stream});
        try request_body_buffer.appendSlice(self.allocator, "}");

        const request_body = request_body_buffer.items;

        std.debug.print("Chat request body: {s}\n", .{request_body});

        const uri = try std.Uri.parse(url_buffer.items);

        var req = try self.client.request(.POST, uri, .{
            .extra_headers = &.{
                .{ .name = "Content-Type", .value = "application/json" },
            },
        });
        defer req.deinit();

        // Content-Length framing is required for POST bodies in Zig 0.15.2;
        // the client derives the Content-Length header from this field.
        req.transfer_encoding = .{ .content_length = request_body.len };

        std.debug.print("Chat: Sending body of length: {}\n", .{request_body.len});

        // BodyWriter API: write the body, then end() to finalize the request.
        var body_writer = try req.sendBody(&.{});
        try body_writer.writer.writeAll(request_body);
        try body_writer.end();

        std.debug.print("Chat: Body sent successfully\n", .{});

        var response = try req.receiveHead(&.{});

        // Heap-allocated 10 MiB response buffer (same as listModels()).
        const response_buffer = try self.allocator.alloc(u8, 10 * 1024 * 1024);
        defer self.allocator.free(response_buffer);
        var response_writer: std.Io.Writer = .fixed(response_buffer);
        var read_buffer: [4096]u8 = undefined;
        const body_reader: *std.Io.Reader = response.reader(&read_buffer);

        // Drain the whole body into the fixed buffer.
        const n = try body_reader.streamRemaining(&response_writer);
        const response_json = response_buffer[0..n];
        std.debug.print("Chat response status: {}, body ({} bytes): {s}\n", .{ response.head.status, n, response_json });

        if (response.head.status != .ok) {
            std.debug.print("Chat request failed with status: {}\n", .{response.head.status});
            return Error.RequestFailed;
        }

        const parsed = try json.parseFromSlice(json.Value, self.allocator, response_json, .{});
        defer parsed.deinit();

        // Response shape: {"message": {"role": "...", "content": "..."}, "done": true}
        // Missing keys surface as InvalidResponse instead of a panic on `.?`.
        const obj = parsed.value.object;
        const msg_obj = (obj.get("message") orelse return Error.InvalidResponse).object;
        const role = (msg_obj.get("role") orelse return Error.InvalidResponse).string;
        const content = (msg_obj.get("content") orelse return Error.InvalidResponse).string;
        const done = (obj.get("done") orelse return Error.InvalidResponse).bool;

        // The parsed strings point into memory freed by `parsed.deinit()`;
        // duplicate them so the caller owns stable copies. errdefer prevents
        // leaking the role copy if the content copy fails to allocate.
        const role_copy = try self.allocator.dupe(u8, role);
        errdefer self.allocator.free(role_copy);
        return ChatResponse{
            .message = Message{
                .role = role_copy,
                .content = try self.allocator.dupe(u8, content),
            },
            .done = done,
            .allocator = self.allocator,
        };
    }

    /// List all models installed on the Ollama server.
    ///
    /// Sends a GET to `{base_url}/api/tags` (Ollama's endpoint for listing
    /// models) and returns one `ModelInfo` per installed model.
    ///
    /// Returns a `ListResponse`; the caller must release it with `deinit()`.
    ///
    /// Errors: network/HTTP failures, non-200 status (`Error.RequestFailed`),
    /// a malformed response body (`Error.InvalidResponse`), JSON parse errors,
    /// or allocation failure.
    pub fn listModels(self: *Ollama) Error!ListResponse {
        // Build "{base_url}/api/tags".
        var url_buffer = std.ArrayListUnmanaged(u8){};
        defer url_buffer.deinit(self.allocator);
        try url_buffer.writer(self.allocator).print("{s}/api/tags", .{self.base_url});

        const uri = try std.Uri.parse(url_buffer.items);

        var req = try self.client.request(.GET, uri, .{});
        defer req.deinit();

        // GET requests carry no body, so no transfer encoding is needed.
        try req.sendBodiless();
        var response = try req.receiveHead(&.{});

        if (response.head.status != .ok) {
            std.debug.print("ListModels request failed with status: {}\n", .{response.head.status});
            return Error.RequestFailed;
        }

        // Heap-allocated 10 MiB response buffer.
        const response_buffer = try self.allocator.alloc(u8, 10 * 1024 * 1024);
        defer self.allocator.free(response_buffer);
        var response_writer: std.Io.Writer = .fixed(response_buffer);
        var read_buffer: [4096]u8 = undefined;
        const body_reader: *std.Io.Reader = response.reader(&read_buffer);

        // Drain the whole body into the fixed buffer.
        const n = try body_reader.streamRemaining(&response_writer);
        const response_json = response_buffer[0..n];
        std.debug.print("ListModels response body ({} bytes): {s}\n", .{ n, response_json });

        const parsed = try json.parseFromSlice(json.Value, self.allocator, response_json, .{});
        defer parsed.deinit();

        // Response shape: {"models": [{"name": ..., "modified_at": ..., "size": ...}, ...]}
        const obj = parsed.value.object;
        const models_array = (obj.get("models") orelse return Error.InvalidResponse).array;

        const models = try self.allocator.alloc(ModelInfo, models_array.items.len);
        // On any error below, free every entry filled so far and then the
        // array itself. The previous implementation leaked all of these.
        errdefer self.allocator.free(models);
        var filled: usize = 0;
        errdefer {
            for (models[0..filled]) |entry| entry.deinit();
        }

        for (models_array.items, 0..) |model_val, i| {
            const model_obj = model_val.object;

            // Duplicate strings out of the parsed tree so they outlive `parsed`.
            // The errdefer covers `name` if the second dupe fails.
            const name = try self.allocator.dupe(u8, model_obj.get("name").?.string);
            errdefer self.allocator.free(name);
            const modified_at = try self.allocator.dupe(u8, model_obj.get("modified_at").?.string);

            models[i] = ModelInfo{
                .name = name,
                .modified_at = modified_at,
                .size = model_obj.get("size").?.integer,
                .allocator = self.allocator,
            };
            filled = i + 1;
        }

        return ListResponse{
            .models = models,
            .allocator = self.allocator,
        };
    }

    /// Append `text` to `buffer` with JSON string escaping applied.
    ///
    /// Ensures user-provided text can be embedded between double quotes in a
    /// JSON document. Escaped characters:
    /// - `"`  → `\"`
    /// - `\`  → `\\`
    /// - `\n` → `\n`, `\r` → `\r`, `\t` → `\t` (short escapes)
    /// - any other control character below 0x20 → `\u00XX`, as required by the
    ///   JSON grammar (RFC 8259). Previously these passed through unescaped,
    ///   producing invalid JSON for inputs containing e.g. 0x00-0x08.
    ///
    /// All other bytes are appended unchanged.
    ///
    /// Note: in Zig 0.15 `std.ArrayList(u8)` is the unmanaged list, so callers
    /// passing `*std.ArrayListUnmanaged(u8)` (a deprecated alias of the same
    /// type) match this signature.
    fn jsonEscape(self: *Ollama, buffer: *std.ArrayList(u8), text: []const u8) !void {
        for (text) |c| {
            switch (c) {
                // Double quote would terminate the enclosing JSON string
                '"' => try buffer.appendSlice(self.allocator, "\\\""),
                // Backslash is the JSON escape character itself
                '\\' => try buffer.appendSlice(self.allocator, "\\\\"),
                '\n' => try buffer.appendSlice(self.allocator, "\\n"),
                '\r' => try buffer.appendSlice(self.allocator, "\\r"),
                '\t' => try buffer.appendSlice(self.allocator, "\\t"),
                // Remaining control characters (excluding \t=0x09, \n=0x0A,
                // \r=0x0D handled above) get the generic \u00XX escape.
                0x00...0x08, 0x0B, 0x0C, 0x0E...0x1F => try buffer.writer(self.allocator).print("\\u{x:0>4}", .{c}),
                // Everything else passes through unchanged
                else => try buffer.append(self.allocator, c),
            }
        }
    }
};

// ============================================================================
// TESTS
// ============================================================================

// Renamed from "OllamaClient - ..." for consistency with the other test names;
// uses std.testing.allocator (leak-checked) instead of copying allocator_instance.
test "Ollama - init and deinit" {
    var client = try Ollama.init(testing.allocator, null);
    defer client.deinit();

    try testing.expect(client.base_url.len > 0);
    try testing.expectEqualStrings("http://localhost:11434", client.base_url);
}

// Uses std.testing.allocator (leak-checked) instead of copying allocator_instance.
test "Ollama - init with custom URL" {
    var client = try Ollama.init(testing.allocator, "http://192.168.1.100:11434");
    defer client.deinit();

    try testing.expectEqualStrings("http://192.168.1.100:11434", client.base_url);
}

// Sanity-check that Message fields round-trip as assigned.
test "Ollama - Message structure" {
    const message: Ollama.Message = .{
        .role = "user",
        .content = "Hello, Ollama!",
    };

    try testing.expectEqualStrings("Hello, Ollama!", message.content);
    try testing.expectEqualStrings("user", message.role);
}

// Verify the documented defaults of GenerateOptions.
test "Ollama - GenerateOptions defaults" {
    const defaults: Ollama.GenerateOptions = .{};

    try testing.expectEqualStrings("granite4:tiny-h", defaults.model);
    try testing.expect(defaults.stream == false);
    try testing.expect(defaults.seed == null);
    try testing.expect(defaults.temperature == null);
}

// Verify that explicitly-set GenerateOptions fields are stored as given.
test "Ollama - GenerateOptions custom" {
    const custom: Ollama.GenerateOptions = .{
        .model = "granite4:tiny-h",
        .temperature = 0.7,
        .seed = 123,
        .stream = false,
        .num_predict = 100,
    };

    try testing.expectEqualStrings("granite4:tiny-h", custom.model);
    try testing.expect(custom.num_predict.? == 100);
    try testing.expect(custom.seed.? == 123);
    try testing.expect(custom.temperature.? == 0.7);
    try testing.expect(custom.stream == false);
}

// Renamed from "OllamaClient - ..." for consistency with the other test names.
test "Ollama - ChatOptions defaults" {
    const opts = Ollama.ChatOptions{};

    try testing.expectEqualStrings("granite4:tiny-h", opts.model);
    try testing.expect(opts.temperature == null);
    try testing.expect(opts.seed == null);
    try testing.expect(opts.stream == false);
}

// Integration tests - require a running Ollama server
// Fixed test-name typo ("Ollama -listModels"); skips by default like the other
// integration tests, since it requires a running Ollama server. Also dropped
// the vacuous `models.len >= 0` assertion (usize is always >= 0).
test "Ollama - listModels integration" {
    if (true) return error.SkipZigTest; // Requires a running Ollama server

    var client = try Ollama.init(testing.allocator, null);
    defer client.deinit();

    const list_response = try client.listModels();
    defer list_response.deinit();

    std.debug.print("\nAvailable models: {d}\n", .{list_response.models.len});
    for (list_response.models) |model| {
        std.debug.print("  - {s} (size: {d})\n", .{ model.name, model.size });
    }
}

// Integration test for generate(); skipped by default because it needs a live server.
test "Ollama - generate integration" {
    if (true) return error.SkipZigTest; // Skipped by default

    var gpa = std.testing.allocator_instance;
    var client = try Ollama.init(gpa.allocator(), null);
    defer client.deinit();

    const result = try client.generate("Say hello in one word", .{ .model = "granite4:tiny-h" });
    defer result.deinit();

    std.debug.print("\nGenerate Response: {s}\n", .{result.response});
    std.debug.print("Model: {s}, Done: {}\n", .{ result.model, result.done });

    try testing.expect(result.done);
    try testing.expect(result.response.len > 0);
}

// Now skips by default like the generate integration test: it requires a
// running Ollama server and would otherwise fail in any offline test run.
test "Ollama - chat integration" {
    if (true) return error.SkipZigTest; // Requires a running Ollama server

    var client = try Ollama.init(testing.allocator, null);
    defer client.deinit();

    const messages = [_]Ollama.Message{
        .{ .role = "user", .content = "What is the capital of France?" },
    };

    const response = try client.chat(&messages, .{
        .model = "granite4:tiny-h",
    });
    defer response.deinit();

    std.debug.print("\nChat Response: {s}\n", .{response.message.content});
    std.debug.print("Role: {s}, Done: {}\n", .{ response.message.role, response.done });

    try testing.expect(response.message.content.len > 0);
    try testing.expectEqualStrings("assistant", response.message.role);
    try testing.expect(response.done);
}