how to bag_of_words.zig
note: these are rough scribblings and/or unfinished drafts, DO NOT use them in prod even if your life depends on it :')
everything was done with the latest version of Zig (0.14.1 at the time of writing)
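quick recap of the idea: collect every distinct token into a vocabulary, then represent a piece of text as a vector with one counter per vocabulary word. word order is thrown away, hence the "bag".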
const std = @import("std");
pub const Vocabulary = struct {
    // distinct words in insertion order; entries are borrowed slices
    // (never copied), so the source text must outlive the vocabulary
    words_list: std.ArrayList([]const u8),

    pub fn init(allocator: std.mem.Allocator) Vocabulary {
        return Vocabulary{ .words_list = std.ArrayList([]const u8).init(allocator) };
    }

    pub fn deinit(self: *Vocabulary) void {
        self.words_list.deinit();
    }

    pub fn words(self: Vocabulary) [][]const u8 {
        return self.words_list.items;
    }
    // linear scan; returns the word's index in the vocabulary, or null
    pub fn search(self: Vocabulary, target: []const u8) ?usize {
        for (self.words_list.items, 0..) |word, index| {
            if (std.mem.eql(u8, word, target)) return index;
        }
        return null;
    }
    // append the token only if it is not already known
    pub fn add(self: *Vocabulary, token: []const u8) !void {
        if (self.search(token) == null) {
            try self.words_list.append(token);
        }
    }

    // note: swapRemove is O(1) but reorders the list, so previously
    // generated count vectors stop lining up with the word indices
    pub fn remove(self: *Vocabulary, token: []const u8) void {
        if (self.search(token)) |index| {
            _ = self.words_list.swapRemove(index);
        }
    }
    // build the bag-of-words vector: one counter per vocabulary word.
    // counts are u8, so a word occurring more than 255 times overflows
    // (a panic in safe builds); a wider type like u32 would be safer
    pub fn generateRepresentation(self: Vocabulary, allocator: std.mem.Allocator, tokens: [][]const u8) ![]const u8 {
        const vector = try allocator.alloc(u8, self.words_list.items.len);
        errdefer allocator.free(vector);
        @memset(vector, 0);
        for (self.words_list.items, 0..) |word, index| {
            for (tokens) |token| {
                if (std.mem.eql(u8, word, token)) {
                    vector[index] += 1;
                }
            }
        }
        return vector;
    }
};
fn tokenize(allocator: std.mem.Allocator, input: []const u8, separator: u8) ![][]const u8 {
    var tokens = std.ArrayList([]const u8).init(allocator);
    errdefer tokens.deinit();
    // splitScalar yields empty slices for consecutive separators;
    // std.mem.tokenizeScalar would skip them instead
    var iterator = std.mem.splitScalar(u8, input, separator);
    while (iterator.next()) |token| {
        try tokens.append(token);
    }
    return try tokens.toOwnedSlice();
}
pub fn main() !void {
    // an arena keeps ownership simple: everything is freed at once on exit
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = arena.allocator();

    var vocab = Vocabulary.init(allocator);
    defer vocab.deinit();

    for (try tokenize(allocator, "that is a cute dog", ' ')) |token| {
        try vocab.add(token);
    }
    for (try tokenize(allocator, "my cat is cute", ' ')) |token| {
        try vocab.add(token);
    }

    for (vocab.words()) |token| {
        std.debug.print("vocab = {s}\n", .{token});
    }

    const vector = try vocab.generateRepresentation(allocator, try tokenize(allocator, "my cat is cute", ' '));
    std.debug.print("vector = {any}\n", .{vector});
}
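running this should print something along these lines (vocab in insertion order; the vector counts the tokens of "my cat is cute" against it, so only is/cute/my/cat are non-zero):

vocab = that
vocab = is
vocab = a
vocab = cute
vocab = dog
vocab = my
vocab = cat
vector = { 0, 1, 0, 1, 0, 1, 1 }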
TODO
- preprocessing: add more preprocessing steps (a rough sketch follows below)
  - lowercase everything, handle punctuation, etc.
  - handle multibyte, non-ASCII chars
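a minimal sketch of what the lowercase/punctuation step could look like, assuming ASCII-only input; the name normalize and the "keep alphanumerics and spaces" policy are placeholder choices, nothing final. note that any byte >= 0x80 gets dropped together with the punctuation, which is exactly the multibyte problem above:

fn normalize(allocator: std.mem.Allocator, input: []const u8) ![]const u8 {
    var out = std.ArrayList(u8).init(allocator);
    errdefer out.deinit();
    for (input) |c| {
        // keep letters, digits and spaces, lowercasing as we go;
        // everything else (ASCII punctuation, control bytes, and any
        // non-ASCII byte >= 0x80) is silently dropped
        if (std.ascii.isAlphanumeric(c) or c == ' ') {
            try out.append(std.ascii.toLower(c));
        }
    }
    return try out.toOwnedSlice();
}

with this in place, normalizing "That is a CUTE dog!" before tokenize produces the same tokens as the clean "that is a cute dog" above.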