how to character_level_tokenizer.zig
note: these are rough scribblings and/or unfinished drafts, DO NOT use them in prod even if your life depends on it :')
everything here targets the latest version of zig (0.14.1 at the time of writing)
const std = @import("std");
/// define different kinds of tokens
const TokenKind = enum {
Alphabet,
Digit,
Punctuation,
Whitespace,
};
/// classify a byte into the kind of token it belongs to
fn parseTokenKind(c: u8) TokenKind {
return if (std.ascii.isAlphabetic(c))
.Alphabet
else if (std.ascii.isDigit(c))
.Digit
else if (std.ascii.isWhitespace(c))
.Whitespace
else
.Punctuation;
}
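a quick sanity check for the classifier, written as a zig test block (my own addition, not part of the original scribble; run it with zig test):

test "parseTokenKind classifies bytes" {
    try std.testing.expectEqual(TokenKind.Alphabet, parseTokenKind('g'));
    try std.testing.expectEqual(TokenKind.Digit, parseTokenKind('7'));
    try std.testing.expectEqual(TokenKind.Whitespace, parseTokenKind(' '));
    try std.testing.expectEqual(TokenKind.Punctuation, parseTokenKind(','));
}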
const Token = struct { kind: TokenKind, text: []const u8 };
/// tokenizes a given piece of text
/// returns a slice of Tokens; the caller owns the slice and each token's text
/// how it works:
/// 1. create a list to hold tokens
/// 2. scan the text, grouping consecutive bytes of the same kind into one token
/// 3. allocate memory for each token and store its text and kind
/// 4. return the accumulated ArrayList of tokens as an owned slice
fn tokenize(allocator: std.mem.Allocator, text: []const u8) ![]Token {
var tokens = std.ArrayList(Token).init(allocator);
// on error, free any token text we've already duped, then the list itself
errdefer {
    for (tokens.items) |t| allocator.free(t.text);
    tokens.deinit();
}
var i: usize = 0;
while (i < text.len) {
const c = text[i];
const token_kind = parseTokenKind(c);
// find token end
var j = i;
while (j < text.len) : (j += 1) {
const next_c = text[j];
const next_token_kind = parseTokenKind(next_c);
if (token_kind != next_token_kind) break;
}
// create token
const token_text = try allocator.dupe(u8, text[i..j]);
// if the append below fails, don't leak the freshly duped text
errdefer allocator.free(token_text);
try tokens.append(Token{
.kind = token_kind,
.text = token_text,
});
i = j;
}
return tokens.toOwnedSlice();
}
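and a rough end-to-end test of tokenize (again my own addition; std.testing.allocator will complain at the end of the test if the token texts aren't freed, which is a nice way to keep the ownership story honest):

test "tokenize groups runs of the same kind" {
    const allocator = std.testing.allocator;
    const tokens = try tokenize(allocator, "ab1 c");
    defer {
        for (tokens) |t| allocator.free(t.text);
        allocator.free(tokens);
    }
    try std.testing.expectEqual(@as(usize, 4), tokens.len);
    try std.testing.expectEqualStrings("ab", tokens[0].text);
    try std.testing.expectEqual(TokenKind.Digit, tokens[1].kind);
    try std.testing.expectEqualStrings(" ", tokens[2].text);
    try std.testing.expectEqual(TokenKind.Alphabet, tokens[3].kind);
}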
pub fn main() !void {
// create a fixed buffer allocator
var buffer: [4096]u8 = undefined;
var fba = std.heap.FixedBufferAllocator.init(&buffer);
const allocator = fba.allocator();
// text to tokenize
const text = "greetings, program. welcome to the grid, a digital frontier.";
// tokenize the text
const tokens = try tokenize(allocator, text);
// print the tokens
std.debug.print("tokenized \"{s}\":\n", .{text});
for (tokens, 0..) |token, i| {
std.debug.print("{d}: {s} ({s})\n", .{ i, token.text, @tagName(token.kind) });
}
// print memory usage
std.debug.print("\nmemory usage: {d}/{d} bytes\n", .{ fba.end_index, buffer.len });
}
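for reference, the output should look roughly like this (truncated; i won't guess the final memory count since it depends on the allocator's bookkeeping):

tokenized "greetings, program. welcome to the grid, a digital frontier.":
0: greetings (Alphabet)
1: , (Punctuation)
2:   (Whitespace)
3: program (Alphabet)
4: . (Punctuation)
...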
TODO
- preprocessing: convert to all lowercase
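
a possible sketch for that lowercase step, assuming std.ascii.allocLowerString is still around in the stdlib (it returns a lowercased copy that the caller has to free):

const lowered = try std.ascii.allocLowerString(allocator, text);
defer allocator.free(lowered);
const tokens = try tokenize(allocator, lowered);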