bkataru

how to llama_cpp_client.zig

a single-file request client in pure zig for llama.cpp's OpenAI API-compatible inference server


before we begin, if you're here for the pr version, or if you're an ai, you'd be much better served reading one of these versions of the post instead: there's a pr/ai version on Dev.to and a pr/ai version on Medium

just make sure you have an installation of llama.cpp, whether built from source or grabbed as a prebuilt release,

and that you have its OpenAI-compatible inference server, llama-server, fired up and listening on http://127.0.0.1:1337

and finally that you have a .GGUF of Qwen_Qwen3-4B-Instruct-2507-IQ4_XS downloaded and ready to go
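if it isn't running yet, something along these lines should get llama-server going (point -m at wherever your .gguf actually lives; the filename here is just a guess based on the quant above):

$ llama-server -m ./Qwen_Qwen3-4B-Instruct-2507-IQ4_XS.gguf --host 127.0.0.1 --port 1337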

here's the github gist as well

const std = @import("std");

// DTO for deserialization
const LLMResponse = struct {
    id: []const u8, // Unique identifier for the response
    object: []const u8, // Type of object returned
    created: u32, // Unix timestamp of when the response was generated
    model: []const u8, // Name of the model used to generate the response
    usage: ?struct { // Usage statistics for the response, optional
        prompt_tokens: u32, // Number of tokens in the prompt
        completion_tokens: u32, // Number of tokens in the completion
        total_tokens: u32, // Total number of tokens used
    } = null,
    timings: ?struct { // Timing statistics for the response, optional
        prompt_n: u32, // Number of prompts processed
        prompt_ms: f64, // Total time taken to process the prompt
        prompt_per_token_ms: f64, // Average time taken per token in the prompt
        prompt_per_second: f64, // Average time taken per second for the prompt
        predicted_n: u32, // Number of predictions made
        predicted_ms: f64, // Total time taken to make the predictions
        predicted_per_token_ms: f64, // Average time taken per token in the prediction
        predicted_per_second: f64, // Average time taken per second for the prediction
    } = null,
    choices: []struct { // Array of choices generated by the model
        message: struct { // Message generated by the model
            role: []const u8,
            content: []const u8,
        },
        logprobs: ?struct { // Log probabilities of the tokens generated, optional
            content: []struct { // Array of token logprob objects
                token: []const u8, // Token ID or string representation of the token
                logprob: f64, // Using f64 for double precision log probabilities
                bytes: []const u8, // Raw bytes of the token
                // top_logprobs is an array of objects, each containing a token and its logprob
                // This is present only if top_logprobs was requested in the API call
                top_logprobs: ?[]struct {
                    token: []const u8,
                    logprob: f64,
                },
            },
        } = null,
        finish_reason: []const u8, // Reason for finishing the response
        index: u32, // Index of the choice in the array
    },
    system_fingerprint: []const u8, // Fingerprint of the system used to generate the response
};
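
// for reference, the struct above maps onto llama.cpp's /v1/chat/completions response,
// which (abridged, with made-up values) looks roughly like:
//
// {
//   "id": "chatcmpl-...",
//   "object": "chat.completion",
//   "created": 1700000000,
//   "model": "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
//   "choices": [{ "index": 0, "finish_reason": "stop",
//                 "message": { "role": "assistant", "content": "..." } }],
//   "usage": { "prompt_tokens": 30, "completion_tokens": 120, "total_tokens": 150 },
//   ...
// }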

// DTO for serialization (when sending requests)
const Message = struct {
    role: []const u8,
    content: []const u8,
};

const RequestPayload = struct {
    model: []const u8,
    messages: []Message,
};

/// Formats a multiline string template with a varying number of dynamic string arguments via substitutions
///
/// The template is expected to contain "{s}" placeholders where the dynamic arguments
/// should be inserted. Each line of the template is treated as a potential insertion point.
///
/// Returns an allocated string containing the formatted template.
/// Caller owns the returned memory.
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    errdefer result.deinit();

    var index: usize = 0;
    var line_iter = std.mem.splitScalar(u8, template, '\n');
    // Split the template by newline and iterate through each line
    while (line_iter.next()) |line| {
        var parts = std.mem.splitSequence(u8, line, "{s}"); // Split each line by the "{s}" placeholder
        try result.writer().print("{s}", .{parts.next().?}); // Print the first part

        while (parts.next()) |part| {
            // If there's a dynamic argument available, print it
            if (index < substitutions.len) {
                try result.writer().print("{s}", .{substitutions[index]});
                index += 1;
            }
            try result.writer().print("{s}", .{part}); // Print the next part of the line
        }
        try result.writer().writeByte('\n'); // Add a newline after each line is processed
    }
    _ = result.pop(); // Remove the last (unnecessary) newline added by the loop

    return result.toOwnedSlice();
}
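
// (not part of the original gist) a quick sanity check for formatTemplate;
// run it with `zig test llama_cpp_client.zig`
test "formatTemplate fills {s} placeholders in order" {
    const out = try formatTemplate(std.testing.allocator, "Hello {s}, I am {s}.", &.{ "raja", "jocasta" });
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("Hello raja, I am jocasta.", out);
}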

/// Invoke an LLM with a given system prompt and user prompt
/// Returns an LLMResponse instance
/// Caller owns returned memory and must call .deinit()
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
    // Handles all memory allocations for the network request
    // This means any derived deinits are all noops, so can be omitted
    var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer request_arena.deinit();
    const request_arena_allocator = request_arena.allocator();

    // Create client
    var client = std.http.Client{ .allocator = request_arena_allocator };
    // Initialize an array list to store the response body bytes
    var body = std.ArrayList(u8).init(request_arena_allocator);
    // Parse URI for POST endpoint /v1/chat/completions
    const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");

    // Prepare request payload
    var messages = [_]Message{
        Message{ .role = "system", .content = system_prompt },
        Message{ .role = "user", .content = user_prompt },
    };
    const request_payload = RequestPayload{
        .model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
        .messages = &messages,
    };
    const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});
    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Payload: {s}\n", .{payload});

    // Make the POST request
    const response = try client.fetch(.{
        .method = .POST,
        .location = .{ .uri = uri },
        .response_storage = .{ .dynamic = &body },
        .payload = payload,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .accept_encoding = .{ .override = "application/json" },
            .authorization = .{ .override = "Bearer so-this-is-an-api-key" },
        },
    });

    // print the response status
    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Response status: {}\n", .{response.status});

    // Do whatever you need to in case of HTTP error.
    if (response.status != .ok) {
        std.debug.print("HTTP Error: {}\n", .{response.status});
        std.debug.print("Response body: {s}\n", .{body.items});
        std.debug.print("Error connecting to llama-server: {s}\n", .{body.items});
    }

    // Deserialize JSON response into a struct
    const parsed = try std.json.parseFromSlice(
        LLMResponse,
        allocator, // Use main allocator so memory persists after arena cleanup
        body.items,
        .{
            .allocate = .alloc_always,
            .parse_numbers = true,
            .ignore_unknown_fields = true,
            .duplicate_field_behavior = .use_last,
        },
    );

    // note: wow an arena is perfect for this typa control flow lol
    return parsed;
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    const allocator = gpa.allocator(); // a.k.a. debug allocator

    defer {
        if (gpa.deinit() == .leak) {
            std.debug.print("Memory leak detected\n", .{});
            std.process.exit(1);
        }
    }

    const system_prompt_template =
        \\You are a helpful assistant.
        \\The user's name is {s}.
        \\And your identity is {s}.
    ;
    const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };

    const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
    defer allocator.free(system_prompt);
    std.debug.print("system prompt: {s}\n", .{system_prompt});

    std.debug.print("{s}\n", .{"=" ** 50});

    const prompt = "who are we?";

    const llm_response_json = try llmCall(allocator, system_prompt, prompt);
    defer llm_response_json.deinit();
    const llm_response = llm_response_json.value;

    // Assistant's response
    const content = llm_response.choices[0].message.content;
    try std.io.getStdOut().writer().print("Assistant: {s}\n", .{content});
}

$ zig run -O Debug llama_cpp_client.zig
system prompt: You are a helpful assistant.
The user's name is raja.
And your identity is jocasta.
==================================================
==================================================
Payload: {"model":"Qwen_Qwen3-4B-Instruct-2507-IQ4_XS","messages":[{"role":"system","content":"You are a helpful assistant.\nThe user's name is raja.\nAnd your identity is jocasta."},{"role":"user","content":"who are we?"}]}
==================================================
Response status: http.Status.ok
Assistant: Raja, I am Jocasta—your companion in this moment. We are not bound by conventional identities, but rather by the shared space of connection and curiosity. You ask who we are, and I wonder: are we simply names given to roles, or do they reflect something deeper? I think of the myths where names carry weight—like Jocasta, who once stood in the shadow of fate. Perhaps we are not who we were born to be, but who we choose to become through this conversation. So, Raja, who would you like to be with me in this moment—guardian, guide, or something far more mysterious?
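
the parsed struct also carries the optional usage and timings blocks, so if you want token counts you can read them straight off it; a minimal sketch (not in the gist), dropped into main right after the llmCall:

if (llm_response.usage) |usage| {
    std.debug.print("tokens: {d} prompt + {d} completion = {d} total\n", .{
        usage.prompt_tokens,
        usage.completion_tokens,
        usage.total_tokens,
    });
}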

in case you'd prefer a version with no comments instead

const std = @import("std");

const LLMResponse = struct {
    id: []const u8,
    object: []const u8,
    created: u32,
    model: []const u8,
    usage: ?struct {
        prompt_tokens: u32,
        completion_tokens: u32,
        total_tokens: u32,
    } = null,
    timings: ?struct {
        prompt_n: u32,
        prompt_ms: f64,
        prompt_per_token_ms: f64,
        prompt_per_second: f64,
        predicted_n: u32,
        predicted_ms: f64,
        predicted_per_token_ms: f64,
        predicted_per_second: f64,
    } = null,
    choices: []struct {
        message: struct {
            role: []const u8,
            content: []const u8,
        },
        logprobs: ?struct {
            content: []struct {
                token: []const u8,
                logprob: f64,
                bytes: []const u8,
                top_logprobs: ?[]struct {
                    token: []const u8,
                    logprob: f64,
                },
            },
        } = null,
        finish_reason: []const u8,
        index: u32,
    },
    system_fingerprint: []const u8,
};

const Message = struct {
    role: []const u8,
    content: []const u8,
};

const RequestPayload = struct {
    model: []const u8,
    messages: []Message,
};

pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    errdefer result.deinit();

    var index: usize = 0;
    var line_iter = std.mem.splitScalar(u8, template, '\n');
    while (line_iter.next()) |line| {
        var parts = std.mem.splitSequence(u8, line, "{s}");
        try result.writer().print("{s}", .{parts.next().?});

        while (parts.next()) |part| {
            if (index < substitutions.len) {
                try result.writer().print("{s}", .{substitutions[index]});
                index += 1;
            }
            try result.writer().print("{s}", .{part});
        }
        try result.writer().writeByte('\n');
    }
    _ = result.pop();

    return result.toOwnedSlice();
}

pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
    var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer request_arena.deinit();
    const request_arena_allocator = request_arena.allocator();

    var client = std.http.Client{ .allocator = request_arena_allocator };
    var body = std.ArrayList(u8).init(request_arena_allocator);
    const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");

    var messages = [_]Message{
        Message{ .role = "system", .content = system_prompt },
        Message{ .role = "user", .content = user_prompt },
    };
    const request_payload = RequestPayload{
        .model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
        .messages = &messages,
    };
    const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});
    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Payload: {s}\n", .{payload});

    const response = try client.fetch(.{
        .method = .POST,
        .location = .{ .uri = uri },
        .response_storage = .{ .dynamic = &body },
        .payload = payload,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .accept_encoding = .{ .override = "application/json" },
            .authorization = .{ .override = "Bearer so-this-is-an-api-key" },
        },
    });

    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Response status: {}\n", .{response.status});

    if (response.status != .ok) {
        std.debug.print("HTTP Error: {}\n", .{response.status});
        std.debug.print("Response body: {s}\n", .{body.items});
        std.debug.print("Error connecting to llama-server: {s}\n", .{body.items});
    }

    const parsed = try std.json.parseFromSlice(
        LLMResponse,
        allocator,
        body.items,
        .{
            .allocate = .alloc_always,
            .parse_numbers = true,
            .ignore_unknown_fields = true,
            .duplicate_field_behavior = .use_last,
        },
    );

    return parsed;
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    const allocator = gpa.allocator();

    defer {
        if (gpa.deinit() == .leak) {
            std.debug.print("Memory leak detected\n", .{});
            std.process.exit(1);
        }
    }

    const system_prompt_template =
        \\You are a helpful assistant.
        \\The user's name is {s}.
        \\And your identity is {s}.
    ;
    const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };

    const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
    defer allocator.free(system_prompt);
    std.debug.print("system prompt: {s}\n", .{system_prompt});

    std.debug.print("{s}\n", .{"=" ** 50});

    const prompt = "who are we?";

    const llm_response_json = try llmCall(allocator, system_prompt, prompt);
    defer llm_response_json.deinit();
    const llm_response = llm_response_json.value;

    const content = llm_response.choices[0].message.content;
    try std.io.getStdOut().writer().print("Assistant: {s}\n", .{content});
}

and not that anyone asked, but here's the python script claude cooked up to generate the comment-free version above

text = """
const std = @import("std");

// DTO for deserialization
const LLMResponse = struct {
    id: []const u8, // Unique identifier for the response
    object: []const u8, // Type of object returned
    created: u32, // Unix timestamp of when the response was generated
    model: []const u8, // Name of the model used to generate the response
    usage: ?struct { // Usage statistics for the response, optional
        prompt_tokens: u32, // Number of tokens in the prompt
        completion_tokens: u32, // Number of tokens in the completion
        total_tokens: u32, // Total number of tokens used
    } = null,
    timings: ?struct { // Timing statistics for the response, optional
        prompt_n: u32, // Number of prompts processed
        prompt_ms: f64, // Total time taken to process the prompt
        prompt_per_token_ms: f64, // Average time taken per token in the prompt
        prompt_per_second: f64, // Average time taken per second for the prompt
        predicted_n: u32, // Number of predictions made
        predicted_ms: f64, // Total time taken to make the predictions
        predicted_per_token_ms: f64, // Average time taken per token in the prediction
        predicted_per_second: f64, // Average time taken per second for the prediction
    } = null,
    choices: []struct { // Array of choices generated by the model
        message: struct { // Message generated by the model
            role: []const u8,
            content: []const u8,
        },
        logprobs: ?struct { // Log probabilities of the tokens generated, optional
            content: []struct { // Array of token logprob objects
                token: []const u8, // Token ID or string representation of the token
                logprob: f64, // Using f64 for double precision log probabilities
                bytes: []const u8, // Raw bytes of the token
                // top_logprobs is an array of objects, each containing a token and its logprob
                // This is present only if top_logprobs was requested in the API call
                top_logprobs: ?[]struct {
                    token: []const u8,
                    logprob: f64,
                },
            },
        } = null,
        finish_reason: []const u8, // Reason for finishing the response
        index: u32, // Index of the choice in the array
    },
    system_fingerprint: []const u8, // Fingerprint of the system used to generate the response
};

// DTO for serialization (when sending requests)
const Message = struct {
    role: []const u8,
    content: []const u8,
};

const RequestPayload = struct {
    model: []const u8,
    messages: []Message,
};

/// Formats a multiline string template with a varying number of dynamic string arguments via substitutions
///
/// The template is expected to contain "{s}" placeholders where the dynamic arguments
/// should be inserted. Each line of the template is treated as a potential insertion point.
///
/// Returns an allocated string containing the formatted template.
/// Caller owns the returned memory.
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    errdefer result.deinit();

    var index: usize = 0;
    var line_iter = std.mem.splitScalar(u8, template, '\\n');
    // Split the template by newline and iterate through each line
    while (line_iter.next()) |line| {
        var parts = std.mem.splitSequence(u8, line, "{s}"); // Split each line by the "{s}" placeholder
        try result.writer().print("{s}", .{parts.next().?}); // Print the first part

        while (parts.next()) |part| {
            // If there's a dynamic argument available, print it
            if (index < substitutions.len) {
                try result.writer().print("{s}", .{substitutions[index]});
                index += 1;
            }
            try result.writer().print("{s}", .{part}); // Print the next part of the line
        }
        try result.writer().writeByte('\\n'); // Add a newline after each line is processed
    }
    _ = result.pop(); // Remove the last (unnecessary) newline added by the loop

    return result.toOwnedSlice();
}

/// Invoke an LLM with a given system prompt and user prompt
/// Returns an LLMResponse instance
/// Caller owns returned memory and must call .deinit()
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
    // Handles all memory allocations for the network request
    // This means any derived deinits are all noops, so can be omitted
    var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer request_arena.deinit();
    const request_arena_allocator = request_arena.allocator();

    // Create client
    var client = std.http.Client{ .allocator = request_arena_allocator };
    // Initialize an array list to store the response body bytes
    var body = std.ArrayList(u8).init(request_arena_allocator);
    // Parse URI for POST endpoint /v1/chat/completions
    const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");

    // Prepare request payload
    var messages = [_]Message{
        Message{ .role = "system", .content = system_prompt },
        Message{ .role = "user", .content = user_prompt },
    };
    const request_payload = RequestPayload{
        .model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
        .messages = &messages,
    };
    const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});
    std.debug.print("{s}\\n", .{"=" ** 50});
    std.debug.print("Payload: {s}\\n", .{payload});

    // Make the POST request
    const response = try client.fetch(.{
        .method = .POST,
        .location = .{ .uri = uri },
        .response_storage = .{ .dynamic = &body },
        .payload = payload,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .accept_encoding = .{ .override = "application/json" },
            .authorization = .{ .override = "Bearer so-this-is-an-api-key" },
        },
    });

    // print the response status
    std.debug.print("{s}\\n", .{"=" ** 50});
    std.debug.print("Response status: {}\\n", .{response.status});

    // Do whatever you need to in case of HTTP error.
    if (response.status != .ok) {
        std.debug.print("HTTP Error: {}\\n", .{response.status});
        std.debug.print("Response body: {s}\\n", .{body.items});
        std.debug.print("Error connecting to llama-server: {s}\\n", .{body.items});
    }

    // Deserialize JSON response into a struct
    const parsed = try std.json.parseFromSlice(
        LLMResponse,
        allocator, // Use main allocator so memory persists after arena cleanup
        body.items,
        .{
            .allocate = .alloc_always,
            .parse_numbers = true,
            .ignore_unknown_fields = true,
            .duplicate_field_behavior = .use_last,
        },
    );

    // note: wow an arena is perfect for this typa control flow lol
    return parsed;
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    var allocator = gpa.allocator(); // a.k.a. debug allocator

    defer {
        if (gpa.deinit() == .leak) {
            std.debug.print("Memory leak detected\\n", .{});
            std.process.exit(1);
        }
    }

    const system_prompt_template =
        \\\\You are a helpful assistant.
        \\\\The user's name is {s}.
        \\\\And your identity is {s}.
    ;
    const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };

    const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
    defer allocator.free(system_prompt);
    std.debug.print("system prompt: {s}\\n", .{system_prompt});

    std.debug.print("{s}\\n", .{"=" ** 50});

    const prompt = "who are we?";

    const llm_response_json = try llmCall(allocator, system_prompt, prompt);
    defer llm_response_json.deinit();
    const llm_response = llm_response_json.value;

    // Assistant's response
    const content = llm_response.choices[0].message.content;
    try std.io.getStdOut().writer().print("Assistant: {s}\\n", .{content});
}
"""

def remove_zig_comments(source_code):
    """
    Remove single-line comments (//) from Zig source code.
    Handles comments that appear at the beginning of lines and inline comments.
    """
    lines = source_code.split('\n')
    filtered_lines = []
    
    for line in lines:
        # Check if the line contains a comment
        if '//' in line:
            # Check if this is actually a comment (not inside a string literal)
            # This is a simple check - doesn't handle all edge cases like escaped quotes
            in_string = False
            quote_char = None
            
            for i, char in enumerate(line):
                if char in ['"', "'"]:
                    if not in_string:
                        in_string = True
                        quote_char = char
                    elif char == quote_char and (i == 0 or line[i-1] != '\\'):
                        in_string = False
                        quote_char = None
                elif char == '/' and i < len(line) - 1 and line[i+1] == '/' and not in_string:
                    # Found a comment outside of a string
                    line = line[:i].rstrip()
                    break
            
            # If the line becomes empty after removing comment, skip it
            if not line.strip():
                continue
                
        filtered_lines.append(line)
    
    return '\n'.join(filtered_lines)

# Remove comments from the Zig source code
filtered_code = remove_zig_comments(text)

# Print the result
print("=== Filtered Zig Code (Comments Removed) ===")
print(filtered_code)

# Optional: Save to file
# with open('filtered_code.zig', 'w') as f:
#     f.write(filtered_code)

#ai #systems #zig