how to bag_of_words.zig
note: these are rough scribblings and/or unfinished drafts, DO NOT use them in prod even if your life depends on it :')
everything was done with the latest version of Zig (0.14.1 at the time of writing)
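quick recap of the idea: collect every distinct token into a vocabulary, then represent a piece of text as a vector with one counter per vocabulary word. word order is thrown away, hence the "bag".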
const std = @import("std");
pub const Vocabulary = struct {
    // distinct words in insertion order; entries are borrowed slices
    // (never copied), so the source text must outlive the vocabulary
    words_list: std.ArrayList([]const u8),

    pub fn init(allocator: std.mem.Allocator) Vocabulary {
        return Vocabulary{ .words_list = std.ArrayList([]const u8).init(allocator) };
    }

    pub fn deinit(self: *Vocabulary) void {
        self.words_list.deinit();
    }

    pub fn words(self: Vocabulary) [][]const u8 {
        return self.words_list.items;
    }
    // linear scan; returns the word's index in the vocabulary, or null
    pub fn search(self: Vocabulary, target: []const u8) ?usize {
        for (self.words_list.items, 0..) |word, index| {
            if (std.mem.eql(u8, word, target)) return index;
        }
        return null;
    }
    // append the token only if it is not already known
    pub fn add(self: *Vocabulary, token: []const u8) !void {
        if (self.search(token) == null) {
            try self.words_list.append(token);
        }
    }

    // note: swapRemove is O(1) but reorders the list, so previously
    // generated count vectors stop lining up with the word indices
    pub fn remove(self: *Vocabulary, token: []const u8) void {
        if (self.search(token)) |index| {
            _ = self.words_list.swapRemove(index);
        }
    }
    // build the bag-of-words vector: one counter per vocabulary word.
    // counts are u8, so a word occurring more than 255 times overflows
    // (a panic in safe builds); a wider type like u32 would be safer
    pub fn generateRepresentation(self: Vocabulary, allocator: std.mem.Allocator, tokens: [][]const u8) ![]const u8 {
        const vector = try allocator.alloc(u8, self.words_list.items.len);
        errdefer allocator.free(vector);
        @memset(vector, 0);
        for (self.words_list.items, 0..) |word, index| {
            for (tokens) |token| {
                if (std.mem.eql(u8, word, token)) {
                    vector[index] += 1;
                }
            }
        }
        return vector;
    }
};
fn tokenize(allocator: std.mem.Allocator, input: []const u8, separator: u8) ![][]const u8 {
    var tokens = std.ArrayList([]const u8).init(allocator);
    errdefer tokens.deinit();
    // splitScalar yields empty slices for consecutive separators;
    // std.mem.tokenizeScalar would skip them instead
    var iterator = std.mem.splitScalar(u8, input, separator);
    while (iterator.next()) |token| {
        try tokens.append(token);
    }
    return try tokens.toOwnedSlice();
}
pub fn main() !void {
    // an arena keeps ownership simple: everything is freed at once on exit
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    var vocab = Vocabulary.init(allocator);
    defer vocab.deinit();

    for (try tokenize(allocator, "that is a cute dog", ' ')) |token| {
        try vocab.add(token);
    }
    for (try tokenize(allocator, "my cat is cute", ' ')) |token| {
        try vocab.add(token);
    }

    for (vocab.words()) |token| {
        std.debug.print("vocab = {s}\n", .{token});
    }

    const vector = try vocab.generateRepresentation(allocator, try tokenize(allocator, "my cat is cute", ' '));
    std.debug.print("vector = {any}\n", .{vector});
}
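running this should print something along these lines (vocab in insertion order; the vector counts the tokens of "my cat is cute" against it, so only is/cute/my/cat are non-zero):

vocab = that
vocab = is
vocab = a
vocab = cute
vocab = dog
vocab = my
vocab = cat
vector = { 0, 1, 0, 1, 0, 1, 1 }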
TODO
- preprocessing: add more preprocessing steps (a rough sketch follows below)
  - lowercase everything, handle punctuation, etc.
  - handle multibyte, non-ASCII chars
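a minimal sketch of what the lowercase/punctuation step could look like, assuming ASCII-only input; the name normalize and the "keep alphanumerics and spaces" policy are placeholder choices, nothing final. note that any byte >= 0x80 gets dropped together with the punctuation, which is exactly the multibyte problem above:

fn normalize(allocator: std.mem.Allocator, input: []const u8) ![]const u8 {
    var out = std.ArrayList(u8).init(allocator);
    errdefer out.deinit();
    for (input) |c| {
        // keep letters, digits and spaces, lowercasing as we go;
        // everything else (ASCII punctuation, control bytes, and any
        // non-ASCII byte >= 0x80) is silently dropped
        if (std.ascii.isAlphanumeric(c) or c == ' ') {
            try out.append(std.ascii.toLower(c));
        }
    }
    return try out.toOwnedSlice();
}

with this in place, normalizing "That is a CUTE dog!" before tokenize produces the same tokens as the clean "that is a cute dog" above.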