how to llama_cpp_client.zig
a single-file request client in pure zig for llama.cpp's OpenAI API compatible inference server
before we begin: if you're here for the pr version, or if you're an ai, you'd be much better served by one of the other versions of this post instead: there's a pr/ai version on Dev.to and another on Medium
- minimal
- no dependencies
- pure zig 0.14.1
- did i mention it's a single file?
- wields zig's rich standard library: std.json for (de)serialization, std.http for interneting. std is all you need
- deterministic memory allocations per request because holy hell arena allocators go so hard (tiny sketch right after this list)
- with a tiny, from-scratch, templating engine???!?! langchain go brr
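to make that arena bullet concrete, here's the pattern in miniature. this is a toy, standalone sketch (not part of the client below): everything allocated for one unit of work comes out of one arena and gets freed by a single deinit.
const std = @import("std");

pub fn main() !void {
    // one arena per unit of work: every allocation below dies in this single deinit
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const a = arena.allocator();

    // allocate as much scratch as you like; no individual frees anywhere
    const greeting = try std.fmt.allocPrint(a, "hello, {s}", .{"raja"});
    const shouted = try std.ascii.allocUpperString(a, greeting);
    std.debug.print("{s}\n", .{shouted});
}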
just make sure you have an installation of llama.cpp. you can
- download precompiled binaries, either
  - directly
  - via some package manager, OS or otherwise
- build from source
  - get yourself a C/C++ toolchain/compiler (MSVC/GCC/Clang) plus Make/CMake beforehand if you choose to venture down this road
- use a frontend that wraps and ships it, such as
  - ollama (ew)
  - jan (yay)
  - lm studio (yay but closed-source)
and that you have its OpenAI compatible inference server fired up and listening on http://127.0.0.1:1337
and finally, that you have a .GGUF of Qwen_Qwen3-4B-Instruct-2507-IQ4_XS downloaded and ready to go
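for reference, a llama-server invocation that matches the host, port, and model assumed by the code below looks roughly like this (the model path is just a guess at wherever your .GGUF lives):
$ llama-server --model ./Qwen_Qwen3-4B-Instruct-2507-IQ4_XS.gguf --host 127.0.0.1 --port 1337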
here's the github gist as well
const std = @import("std");
// DTO for deserialization
const LLMResponse = struct {
id: []const u8, // Unique identifier for the response
object: []const u8, // Type of object returned
created: u32, // Unix timestamp of when the response was generated
model: []const u8, // Name of the model used to generate the response
usage: ?struct { // Usage statistics for the response, optional
prompt_tokens: u32, // Number of tokens in the prompt
completion_tokens: u32, // Number of tokens in the completion
total_tokens: u32, // Total number of tokens used
} = null,
timings: ?struct { // Timing statistics for the response, optional
prompt_n: u32, // Number of prompt tokens processed
prompt_ms: f64, // Total time taken to process the prompt, in milliseconds
prompt_per_token_ms: f64, // Average time per prompt token, in milliseconds
prompt_per_second: f64, // Prompt tokens processed per second
predicted_n: u32, // Number of tokens generated
predicted_ms: f64, // Total time taken to generate them, in milliseconds
predicted_per_token_ms: f64, // Average time per generated token, in milliseconds
predicted_per_second: f64, // Generated tokens per second
} = null,
choices: []struct { // Array of choices generated by the model
message: struct { // Message generated by the model
role: []const u8,
content: []const u8,
},
logprobs: ?struct { // Log probabilities of the tokens generated, optional
content: []struct { // Array of token logprob objects
token: []const u8, // Token ID or string representation of the token
logprob: f64, // Using f64 for double precision log probabilities
bytes: []const u8, // Raw bytes of the token
// top_logprobs is an array of objects, each containing a token and its logprob
// This is present only if top_logprobs was requested in the API call
top_logprobs: ?[]struct {
token: []const u8,
logprob: f64,
},
},
} = null,
finish_reason: []const u8, // Reason for finishing the response
index: u32, // Index of the choice in the array
},
system_fingerprint: []const u8, // Fingerprint of the system used to generate the response
};
// DTO for serialization (when sending requests)
const Message = struct {
role: []const u8,
content: []const u8,
};
const RequestPayload = struct {
model: []const u8,
messages: []Message,
};
/// Formats a multiline string template with a varying number of dynamic string arguments via substitutions
///
/// The template is expected to contain "{s}" placeholders where the dynamic arguments
/// should be inserted. Each line of the template is treated as a potential insertion point.
///
/// Returns an allocated string containing the formatted template.
/// Caller owns the returned memory.
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
var result = std.ArrayList(u8).init(allocator);
errdefer result.deinit();
var index: usize = 0;
var line_iter = std.mem.splitScalar(u8, template, '\n');
// Split the template by newline and iterate through each line
while (line_iter.next()) |line| {
var parts = std.mem.splitSequence(u8, line, "{s}"); // Split each line by the "{s}" placeholder
try result.writer().print("{s}", .{parts.next().?}); // Print the first part
while (parts.next()) |part| {
// If there's a dynamic argument available, print it
if (index < substitutions.len) {
try result.writer().print("{s}", .{substitutions[index]});
index += 1;
}
try result.writer().print("{s}", .{part}); // Print the next part of the line
}
try result.writer().writeByte('\n'); // Add a newline after each line is processed
}
_ = result.pop(); // Remove the last (unnecessary) newline added by the loop
return result.toOwnedSlice();
}
/// Invoke an LLM with a given system prompt and user prompt
/// Returns an LLMResponse instance
/// Caller owns returned memory and must call .deinit()
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
// Handles all memory allocations for the network request
// This means any derived deinits are all noops, so can be omitted
var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer request_arena.deinit();
const request_arena_allocator = request_arena.allocator();
// Create client
var client = std.http.Client{ .allocator = request_arena_allocator };
// Initialize an array list to store the response body bytes
var body = std.ArrayList(u8).init(request_arena_allocator);
// Parse URI for POST endpoint /v1/chat/completions
const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");
// Prepare request payload
var messages = [_]Message{
Message{ .role = "system", .content = system_prompt },
Message{ .role = "user", .content = user_prompt },
};
const request_payload = RequestPayload{
.model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
.messages = &messages,
};
const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});
std.debug.print("{s}\n", .{"=" ** 50});
std.debug.print("Payload: {s}\n", .{payload});
// Make the POST request
const response = try client.fetch(.{
.method = .POST,
.location = .{ .uri = uri },
.response_storage = .{ .dynamic = &body },
.payload = payload,
.headers = .{
.content_type = .{ .override = "application/json" },
.accept_encoding = .{ .override = "application/json" },
.authorization = .{ .override = "Bearer so-this-is-an-api-key" },
},
});
// print the response status
std.debug.print("{s}\n", .{"=" ** 50});
std.debug.print("Response status: {}\n", .{response.status});
// Do whatever you need to in case of HTTP error.
if (response.status != .ok) {
std.debug.print("HTTP Error: {}\n", .{response.status});
std.debug.print("Response body: {s}\n", .{body.items});
std.debug.print("Error connecting to llama-server: {s}\n", .{body.items});
}
// Deserialize JSON response into a struct
const parsed = try std.json.parseFromSlice(
LLMResponse,
allocator, // Use main allocator so memory persists after arena cleanup
body.items,
.{
.allocate = .alloc_always,
.parse_numbers = true,
.ignore_unknown_fields = true,
.duplicate_field_behavior = .use_last,
},
);
// note: wow an arena is perfect for this typa control flow lol
return parsed;
}
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
const allocator = gpa.allocator(); // a.k.a. debug allocator
defer {
if (gpa.deinit() == .leak) {
std.debug.print("Memory leak detected\n", .{});
std.process.exit(1);
}
}
const system_prompt_template =
\\You are a helpful assistant.
\\The user's name is {s}.
\\And your identity is {s}.
;
const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };
const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
defer allocator.free(system_prompt);
std.debug.print("system prompt: {s}\n", .{system_prompt});
std.debug.print("{s}\n", .{"=" ** 50});
const prompt = "who are we?";
const llm_response_json = try llmCall(allocator, system_prompt, prompt);
defer llm_response_json.deinit();
const llm_response = llm_response_json.value;
// Assistant's response
const content = llm_response.choices[0].message.content;
try std.io.getStdOut().writer().print("Assistant: {s}\n", .{content});
}
$ zig run -O Debug llama_cpp_client.zig
system prompt: You are a helpful assistant.
The user's name is raja.
And your identity is jocasta.
==================================================
==================================================
Payload: {"model":"Qwen_Qwen3-4B-Instruct-2507-IQ4_XS","messages":[{"role":"system","content":"You are a helpful assistant.\nThe user's name is raja.\nAnd your identity is jocasta."},{"role":"user","content":"who are we?"}]}
==================================================
Response status: http.Status.ok
Assistant: Raja, I am Jocasta—your companion in this moment. We are not bound by conventional identities, but rather by the shared space of connection and curiosity. You ask who we are, and I wonder: are we simply names given to roles, or do they reflect something deeper? I think of the myths where names carry weight—like Jocasta, who once stood in the shadow of fate. Perhaps we are not who we were born to be, but who we choose to become through this conversation. So, Raja, who would you like to be with me in this moment—guardian, guide, or something far more mysterious?
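and if you want to sanity-check the tiny templating engine on its own, here's a small test block you could drop into the file (it's not in the gist, just a suggestion) and run with zig test llama_cpp_client.zig
test "formatTemplate fills {s} placeholders line by line" {
    const template =
        \\hello {s}
        \\you are {s}
    ;
    const subs = [_][]const u8{ "raja", "jocasta" };
    // caller owns the returned slice, so free it with the same allocator
    const out = try formatTemplate(std.testing.allocator, template, &subs);
    defer std.testing.allocator.free(out);
    try std.testing.expectEqualStrings("hello raja\nyou are jocasta", out);
}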
in case you'd prefer a version with no comments instead
const std = @import("std");
const LLMResponse = struct {
id: []const u8,
object: []const u8,
created: u32,
model: []const u8,
usage: ?struct {
prompt_tokens: u32,
completion_tokens: u32,
total_tokens: u32,
} = null,
timings: ?struct {
prompt_n: u32,
prompt_ms: f64,
prompt_per_token_ms: f64,
prompt_per_second: f64,
predicted_n: u32,
predicted_ms: f64,
predicted_per_token_ms: f64,
predicted_per_second: f64,
} = null,
choices: []struct {
message: struct {
role: []const u8,
content: []const u8,
},
logprobs: ?struct {
content: []struct {
token: []const u8,
logprob: f64,
bytes: []const u8,
top_logprobs: ?[]struct {
token: []const u8,
logprob: f64,
},
},
} = null,
finish_reason: []const u8,
index: u32,
},
system_fingerprint: []const u8,
};
const Message = struct {
role: []const u8,
content: []const u8,
};
const RequestPayload = struct {
model: []const u8,
messages: []Message,
};
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
var result = std.ArrayList(u8).init(allocator);
errdefer result.deinit();
var index: usize = 0;
var line_iter = std.mem.splitScalar(u8, template, '\n');
while (line_iter.next()) |line| {
var parts = std.mem.splitSequence(u8, line, "{s}");
try result.writer().print("{s}", .{parts.next().?});
while (parts.next()) |part| {
if (index < substitutions.len) {
try result.writer().print("{s}", .{substitutions[index]});
index += 1;
}
try result.writer().print("{s}", .{part});
}
try result.writer().writeByte('\n');
}
_ = result.pop();
return result.toOwnedSlice();
}
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer request_arena.deinit();
const request_arena_allocator = request_arena.allocator();
var client = std.http.Client{ .allocator = request_arena_allocator };
var body = std.ArrayList(u8).init(request_arena_allocator);
const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");
var messages = [_]Message{
Message{ .role = "system", .content = system_prompt },
Message{ .role = "user", .content = user_prompt },
};
const request_payload = RequestPayload{
.model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
.messages = &messages,
};
const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});
std.debug.print("{s}\n", .{"=" ** 50});
std.debug.print("Payload: {s}\n", .{payload});
const response = try client.fetch(.{
.method = .POST,
.location = .{ .uri = uri },
.response_storage = .{ .dynamic = &body },
.payload = payload,
.headers = .{
.content_type = .{ .override = "application/json" },
.accept_encoding = .{ .override = "application/json" },
.authorization = .{ .override = "Bearer so-this-is-an-api-key" },
},
});
std.debug.print("{s}\n", .{"=" ** 50});
std.debug.print("Response status: {}\n", .{response.status});
if (response.status != .ok) {
std.debug.print("HTTP Error: {}\n", .{response.status});
std.debug.print("Response body: {s}\n", .{body.items});
std.debug.print("Error connecting to llama-server: {s}\n", .{body.items});
}
const parsed = try std.json.parseFromSlice(
LLMResponse,
allocator,
body.items,
.{
.allocate = .alloc_always,
.parse_numbers = true,
.ignore_unknown_fields = true,
.duplicate_field_behavior = .use_last,
},
);
return parsed;
}
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
const allocator = gpa.allocator();
defer {
if (gpa.deinit() == .leak) {
std.debug.print("Memory leak detected\n", .{});
std.process.exit(1);
}
}
const system_prompt_template =
\\You are a helpful assistant.
\\The user's name is {s}.
\\And your identity is {s}.
;
const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };
const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
defer allocator.free(system_prompt);
std.debug.print("system prompt: {s}\n", .{system_prompt});
std.debug.print("{s}\n", .{"=" ** 50});
const prompt = "who are we?";
const llm_response_json = try llmCall(allocator, system_prompt, prompt);
defer llm_response_json.deinit();
const llm_response = llm_response_json.value;
const content = llm_response.choices[0].message.content;
try std.io.getStdOut().writer().print("Assistant: {s}\n", .{content});
}
and not that anyone asked but here's the python script claude cooked up to generate this
text = """
const std = @import("std");
// DTO for deserialization
const LLMResponse = struct {
id: []const u8, // Unique identifier for the response
object: []const u8, // Type of object returned
created: u32, // Unix timestamp of when the response was generated
model: []const u8, // Name of the model used to generate the response
usage: ?struct { // Usage statistics for the response, optional
prompt_tokens: u32, // Number of tokens in the prompt
completion_tokens: u32, // Number of tokens in the completion
total_tokens: u32, // Total number of tokens used
} = null,
timings: ?struct { // Timing statistics for the response, optional
prompt_n: u32, // Number of prompt tokens processed
prompt_ms: f64, // Total time taken to process the prompt, in milliseconds
prompt_per_token_ms: f64, // Average time per prompt token, in milliseconds
prompt_per_second: f64, // Prompt tokens processed per second
predicted_n: u32, // Number of tokens generated
predicted_ms: f64, // Total time taken to generate them, in milliseconds
predicted_per_token_ms: f64, // Average time per generated token, in milliseconds
predicted_per_second: f64, // Generated tokens per second
} = null,
choices: []struct { // Array of choices generated by the model
message: struct { // Message generated by the model
role: []const u8,
content: []const u8,
},
logprobs: ?struct { // Log probabilities of the tokens generated, optional
content: []struct { // Array of token logprob objects
token: []const u8, // Token ID or string representation of the token
logprob: f64, // Using f64 for double precision log probabilities
bytes: []const u8, // Raw bytes of the token
// top_logprobs is an array of objects, each containing a token and its logprob
// This is present only if top_logprobs was requested in the API call
top_logprobs: ?[]struct {
token: []const u8,
logprob: f64,
},
},
} = null,
finish_reason: []const u8, // Reason for finishing the response
index: u32, // Index of the choice in the array
},
system_fingerprint: []const u8, // Fingerprint of the system used to generate the response
};
// DTO for serialization (when sending requests)
const Message = struct {
role: []const u8,
content: []const u8,
};
const RequestPayload = struct {
model: []const u8,
messages: []Message,
};
/// Formats a multiline string template with a varying number of dynamic string arguments via substitutions
///
/// The template is expected to contain "{s}" placeholders where the dynamic arguments
/// should be inserted. Each line of the template is treated as a potential insertion point.
///
/// Returns an allocated string containing the formatted template.
/// Caller owns the returned memory.
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
var result = std.ArrayList(u8).init(allocator);
errdefer result.deinit();
var index: usize = 0;
var line_iter = std.mem.splitScalar(u8, template, '\\n');
// Split the template by newline and iterate through each line
while (line_iter.next()) |line| {
var parts = std.mem.splitSequence(u8, line, "{s}"); // Split each line by the "{s}" placeholder
try result.writer().print("{s}", .{parts.next().?}); // Print the first part
while (parts.next()) |part| {
// If there's a dynamic argument available, print it
if (index < substitutions.len) {
try result.writer().print("{s}", .{substitutions[index]});
index += 1;
}
try result.writer().print("{s}", .{part}); // Print the next part of the line
}
try result.writer().writeByte('\\n'); // Add a newline after each line is processed
}
_ = result.pop(); // Remove the last (unnecessary) newline added by the loop
return result.toOwnedSlice();
}
/// Invoke an LLM with a given system prompt and user prompt
/// Returns an LLMResponse instance
/// Caller owns returned memory and must call .deinit()
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
// Handles all memory allocations for the network request
// This means any derived deinits are all noops, so can be omitted
var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer request_arena.deinit();
const request_arena_allocator = request_arena.allocator();
// Create client
var client = std.http.Client{ .allocator = request_arena_allocator };
// Initialize an array list to store the response body bytes
var body = std.ArrayList(u8).init(request_arena_allocator);
// Parse URI for POST endpoint /v1/chat/completions
const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");
// Prepare request payload
var messages = [_]Message{
Message{ .role = "system", .content = system_prompt },
Message{ .role = "user", .content = user_prompt },
};
const request_payload = RequestPayload{
.model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
.messages = &messages,
};
const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});
std.debug.print("{s}\\n", .{"=" ** 50});
std.debug.print("Payload: {s}\\n", .{payload});
// Make the POST request
const response = try client.fetch(.{
.method = .POST,
.location = .{ .uri = uri },
.response_storage = .{ .dynamic = &body },
.payload = payload,
.headers = .{
.content_type = .{ .override = "application/json" },
.accept_encoding = .{ .override = "application/json" },
.authorization = .{ .override = "Bearer so-this-is-an-api-key" },
},
});
// print the response status
std.debug.print("{s}\\n", .{"=" ** 50});
std.debug.print("Response status: {}\\n", .{response.status});
// Do whatever you need to in case of HTTP error.
if (response.status != .ok) {
std.debug.print("HTTP Error: {}\\n", .{response.status});
std.debug.print("Response body: {s}\\n", .{body.items});
std.debug.print("Error connecting to llama-server: {s}\\n", .{body.items});
}
// Deserialize JSON response into a struct
const parsed = try std.json.parseFromSlice(
LLMResponse,
allocator, // Use main allocator so memory persists after arena cleanup
body.items,
.{
.allocate = .alloc_always,
.parse_numbers = true,
.ignore_unknown_fields = true,
.duplicate_field_behavior = .use_last,
},
);
// note: wow an arena is perfect for this typa control flow lol
return parsed;
}
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
const allocator = gpa.allocator(); // a.k.a. debug allocator
defer {
if (gpa.deinit() == .leak) {
std.debug.print("Memory leak detected\\n", .{});
std.process.exit(1);
}
}
const system_prompt_template =
\\\\You are a helpful assistant.
\\\\The user's name is {s}.
\\\\And your identity is {s}.
;
const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };
const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
defer allocator.free(system_prompt);
std.debug.print("system prompt: {s}\\n", .{system_prompt});
std.debug.print("{s}\\n", .{"=" ** 50});
const prompt = "who are we?";
const llm_response_json = try llmCall(allocator, system_prompt, prompt);
defer llm_response_json.deinit();
const llm_response = llm_response_json.value;
// Assistant's response
const content = llm_response.choices[0].message.content;
try std.io.getStdOut().writer().print("Assistant: {s}\\n", .{content});
}
"""
def remove_zig_comments(source_code):
    """
    Remove single-line comments (//) from Zig source code.
    Handles comments that appear at the beginning of lines and inline comments.
    """
    lines = source_code.split('\n')
    filtered_lines = []
    for line in lines:
        # Only scan lines that contain a potential comment
        if '//' in line:
            # Walk the line, tracking whether we're inside a string literal.
            # This is a simple check - it doesn't handle all edge cases like escaped quotes.
            in_string = False
            quote_char = None
            for i, char in enumerate(line):
                if char in ['"', "'"]:
                    if not in_string:
                        in_string = True
                        quote_char = char
                    elif char == quote_char and (i == 0 or line[i-1] != '\\'):
                        in_string = False
                        quote_char = None
                elif char == '/' and i < len(line) - 1 and line[i+1] == '/' and not in_string:
                    # Found a comment outside of a string: drop it along with trailing whitespace
                    line = line[:i].rstrip()
                    break
        # If the line is empty (or becomes empty after removing a comment), skip it
        if not line.strip():
            continue
        filtered_lines.append(line)
    return '\n'.join(filtered_lines)
# Remove comments from the Zig source code
filtered_code = remove_zig_comments(text)
# Print the result
print("=== Filtered Zig Code (Comments Removed) ===")
print(filtered_code)
# Optional: Save to file
# with open('filtered_code.zig', 'w') as f:
# f.write(filtered_code)