how to character_level_tokenizer.zig
note: these are rough scribblings and/or unfinished drafts, DO NOT use them in prod even if your life depends on it :')
everything here targets the latest version of zig (0.14.1 at the time of writing)
const std = @import("std");
/// define different kinds of tokens
const TokenKind = enum {
Alphabet,
Digit,
Punctuation,
Whitespace,
};
/// classify a byte into the kind of token it belongs to
fn parseTokenKind(c: u8) TokenKind {
return if (std.ascii.isAlphabetic(c))
.Alphabet
else if (std.ascii.isDigit(c))
.Digit
else if (std.ascii.isWhitespace(c))
.Whitespace
else
.Punctuation;
}
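a quick sanity check for the classifier, written as a zig test block (my own addition, not part of the original scribble; run it with zig test):

test "parseTokenKind classifies bytes" {
    try std.testing.expectEqual(TokenKind.Alphabet, parseTokenKind('g'));
    try std.testing.expectEqual(TokenKind.Digit, parseTokenKind('7'));
    try std.testing.expectEqual(TokenKind.Whitespace, parseTokenKind(' '));
    try std.testing.expectEqual(TokenKind.Punctuation, parseTokenKind(','));
}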
const Token = struct { kind: TokenKind, text: []const u8 };
/// tokenizes a given piece of text
/// returns a slice of Tokens; the caller owns the slice and each token's text
/// how it works:
/// 1. create a list to hold tokens
/// 2. scan the text, grouping consecutive bytes of the same kind into one token
/// 3. allocate memory for each token and store its text and kind
/// 4. return the accumulated ArrayList of tokens as an owned slice
fn tokenize(allocator: std.mem.Allocator, text: []const u8) ![]Token {
var tokens = std.ArrayList(Token).init(allocator);
// on error, free any token text we've already duped, then the list itself
errdefer {
    for (tokens.items) |t| allocator.free(t.text);
    tokens.deinit();
}
var i: usize = 0;
while (i < text.len) {
const c = text[i];
const token_kind = parseTokenKind(c);
// find token end
var j = i;
while (j < text.len) : (j += 1) {
const next_c = text[j];
const next_token_kind = parseTokenKind(next_c);
if (token_kind != next_token_kind) break;
}
// create token
const token_text = try allocator.dupe(u8, text[i..j]);
// if the append below fails, don't leak the freshly duped text
errdefer allocator.free(token_text);
try tokens.append(Token{
.kind = token_kind,
.text = token_text,
});
i = j;
}
return tokens.toOwnedSlice();
}
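and a rough end-to-end test of tokenize (again my own addition; std.testing.allocator will complain at the end of the test if the token texts aren't freed, which is a nice way to keep the ownership story honest):

test "tokenize groups runs of the same kind" {
    const allocator = std.testing.allocator;
    const tokens = try tokenize(allocator, "ab1 c");
    defer {
        for (tokens) |t| allocator.free(t.text);
        allocator.free(tokens);
    }
    try std.testing.expectEqual(@as(usize, 4), tokens.len);
    try std.testing.expectEqualStrings("ab", tokens[0].text);
    try std.testing.expectEqual(TokenKind.Digit, tokens[1].kind);
    try std.testing.expectEqualStrings(" ", tokens[2].text);
    try std.testing.expectEqual(TokenKind.Alphabet, tokens[3].kind);
}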
pub fn main() !void {
// create a fixed buffer allocator
var buffer: [4096]u8 = undefined;
var fba = std.heap.FixedBufferAllocator.init(&buffer);
const allocator = fba.allocator();
// text to tokenize
const text = "greetings, program. welcome to the grid, a digital frontier.";
// tokenize the text
const tokens = try tokenize(allocator, text);
// print the tokens
std.debug.print("tokenized \"{s}\":\n", .{text});
for (tokens, 0..) |token, i| {
std.debug.print("{d}: {s} ({s})\n", .{ i, token.text, @tagName(token.kind) });
}
// print memory usage
std.debug.print("\nmemory usage: {d}/{d} bytes\n", .{ fba.end_index, buffer.len });
}
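for reference, the output should look roughly like this (truncated; i won't guess the final memory count since it depends on the allocator's bookkeeping):

tokenized "greetings, program. welcome to the grid, a digital frontier.":
0: greetings (Alphabet)
1: , (Punctuation)
2:   (Whitespace)
3: program (Alphabet)
4: . (Punctuation)
...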
TODO
- preprocessing: convert to all lowercase
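
a possible sketch for that lowercase step, assuming std.ascii.allocLowerString is still around in the stdlib (it returns a lowercased copy that the caller has to free):

const lowered = try std.ascii.allocLowerString(allocator, text);
defer allocator.free(lowered);
const tokens = try tokenize(allocator, lowered);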