summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Token.zig97
-rw-r--r--src/main.zig16
-rw-r--r--src/supported-languages1
3 files changed, 114 insertions, 0 deletions
diff --git a/src/Token.zig b/src/Token.zig
new file mode 100644
index 0000000..1289a0d
--- /dev/null
+++ b/src/Token.zig
@@ -0,0 +1,97 @@
+// Tokenizer
+// SPDX-FileCopyrightText: 2025 Nguyễn Gia Phong
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+const order = std.math.order;
+const std = @import("std");
+
+const Node = tree_sitter.Node;
+const Parser = tree_sitter.Parser;
+const Tree = tree_sitter.Tree;
+const TreeCursor = tree_sitter.TreeCursor;
+const tree_sitter = @import("tree-sitter");
+
+const CreateLanguage = @import("languages").Create;
+
+const Token = @This();
+text: []const u8,
+node: ?Node = null,
+
+const Iterator = struct {
+    text: []const u8,
+    parser: *Parser,
+    tree: *Tree,
+    cursor: TreeCursor,
+    next_node: ?Node = null,
+    pos: u32 = 0,
+
+    /// Returns the next tree-sitter node.
+    fn nextNode(self: *Iterator) ?Node {
+        if (self.next_node) |node| {
+            self.next_node = null;
+            return node;
+        }
+        const node = self.cursor.node();
+        return if (self.cursor.gotoFirstChild())
+            node
+        else if (self.cursor.gotoNextSibling())
+            node
+        else while (self.cursor.gotoParent()) {
+            if (self.cursor.gotoNextSibling())
+                break node;
+        } else null;
+    }
+
+    pub fn next(self: *Iterator) ?Token {
+        if (self.pos == self.text.len)
+            return null;
+        while (self.nextNode()) |node|
+            if (node.childCount() > 0) {
+                const start = node.startByte();
+                const end = node.endByte();
+                switch (order(self.pos, start)) {
+                    .lt => {
+                        defer self.pos = start;
+                        self.next_node = node;
+                        return .{ .text = self.text[self.pos..start] };
+                    },
+                    .eq => {
+                        defer self.pos = end;
+                        return .{ .text = self.text[start..end], .node = node };
+                    },
+                    .gt => unreachable,
+                }
+            };
+        switch (order(self.pos, self.text.len)) {
+            .lt => {
+                defer self.pos = @intCast(self.text.len);
+                return .{ .text = self.text[self.pos..] };
+            },
+            .eq => return null,
+            .gt => unreachable,
+        }
+    }
+
+    pub fn deinit(self: *Iterator) void {
+        self.cursor.destroy();
+        self.tree.destroy();
+        self.parser.getLanguage().?.destroy();
+        self.parser.destroy();
+        self.* = undefined;
+    }
+};
+
+/// Parse text in given language and return an iterator of tokens.
+pub fn ize(text: []const u8, createLanguage: CreateLanguage) error {
+    IncompatibleVersion,
+}!Iterator {
+    const parser = Parser.create();
+    try parser.setLanguage(createLanguage());
+    const tree = parser.parseString(text, null).?;
+    return .{
+        .text = text,
+        .parser = parser,
+        .tree = tree,
+        .cursor = tree.walk(),
+    };
+}
diff --git a/src/main.zig b/src/main.zig
new file mode 100644
index 0000000..c55186b
--- /dev/null
+++ b/src/main.zig
@@ -0,0 +1,16 @@
+// Entry point
+// SPDX-FileCopyrightText: 2025 Nguyễn Gia Phong
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+const std = @import("std");
+
+const Token = @import("Token.zig");
+const languages = @import("languages");
+
+pub fn main() !void {
+    const text = "int main()\n{\n\treturn 0;\n}\n";
+    var tokens = try Token.ize(text, languages.c);
+    defer tokens.deinit();
+    while (tokens.next()) |token|
+        std.debug.print("{s}", .{ token.text });
+}
diff --git a/src/supported-languages b/src/supported-languages
new file mode 100644
index 0000000..f2ad6c7
--- /dev/null
+++ b/src/supported-languages
@@ -0,0 +1 @@
+c