//! This is a very simple gitignoire parser using already existing code for wildcard matching
//! it was easy to implement but it is not 100% optimized.
//!
//! There are a bunch of things other libs are doing to optimize ignoring like grouping, pattern combination,
//! caching and so on, imporving the performance but this is a good start for what it worth.
const std = @import("std");
const mem = std.mem;
const Allocator = std.mem.Allocator;
const glob = @import("zlob.zig");
const path_matcher = @import("path_matcher.zig");
/// A single gitignore pattern with pre-computed metadata
pub const Pattern = struct {
/// The pattern text (slice into source)
text: []const u8,
/// Pattern is negated (starts with !)
negated: bool,
/// Pattern only matches directories (ends with /)
dir_only: bool,
/// Pattern is anchored (contains / other than trailing)
anchored: bool,
/// Pattern contains ** for recursive matching
has_double_star: bool,
/// Pattern has no wildcards (can use literal matching)
is_literal: bool,
/// Pattern text contains / (requires full path matching if not anchored)
has_slash: bool,
/// For simple suffix patterns like *.rs - the suffix without * (e.g., ".rs")
/// Used for both suffix_patterns hash map grouping and fast matching
suffix: ?[]const u8,
/// Pre-computed suffix length for fast comparison
suffix_len: u8,
/// Pre-computed u32 of suffix for SIMD-style matching (suffixes <= 4 bytes)
suffix_u32: u32,
/// Index in original pattern list (for negation ordering)
index: u16,
};
/// String hash map for O(1) literal lookups
const StringHashMap = std.StringHashMap(PatternMatch);
const PatternMatch = struct {
negated: bool,
dir_only: bool,
index: u16,
};
/// Gitignore pattern set with optimized matching
pub const GitIgnore = struct {
/// All patterns for full matching
patterns: []Pattern,
/// Wildcard patterns only (excludes literals)
wildcard_patterns: []Pattern,
/// Literal directory patterns for O(1) lookup (e.g., "target", "node_modules")
literal_dirs: StringHashMap,
/// Literal file patterns for O(1) lookup
literal_files: StringHashMap,
/// Suffix patterns grouped by extension (e.g., ".rs" -> [pattern for *.rs])
suffix_patterns: std.StringHashMap([]Pattern),
/// Whether any negation pattern exists (enables early termination)
has_negations: bool,
/// Index of first negation pattern (for early termination)
first_negation_index: u16,
/// Allocator for cleanup
allocator: Allocator,
/// Original file content - patterns slice into this
source: []const u8,
/// Cache for directory decisions: path -> should_skip
dir_cache: std.StringHashMap(bool),
const Self = @This();
/// Load and parse .gitignore from current working directory
/// Returns null if no .gitignore file exists
pub fn loadFromCwd(allocator: Allocator, io: std.Io) !?Self {
const cwd = std.Io.Dir.cwd();
const content = cwd.readFileAlloc(io, ".gitignore", allocator, .limited(1024 * 1024)) catch |err| {
if (err == error.FileNotFound) return null;
if (err == error.StreamTooLong) return null;
return err;
};
return try parseOwned(allocator, content);
}
/// Load and parse .gitignore from a specific directory
/// Returns null if no .gitignore file exists
pub fn loadFromDir(allocator: Allocator, io: std.Io, dir_path: []const u8) !?Self {
var path_buf: [4096]u8 = undefined;
const gitignore_path = if (dir_path.len > 0 and !mem.eql(u8, dir_path, "."))
std.fmt.bufPrint(&path_buf, "{s}/.gitignore", .{dir_path}) catch return null
else
".gitignore";
const cwd = std.Io.Dir.cwd();
const content = cwd.readFileAlloc(io, gitignore_path, allocator, .limited(1024 * 1024)) catch |err| {
if (err == error.FileNotFound) return null;
if (err == error.StreamTooLong) return null;
return err;
};
return try parseOwned(allocator, content);
}
/// Check if a pattern text contains any glob wildcards (SIMD-accelerated)
fn hasWildcards(text: []const u8) bool {
return glob.hasWildcardsBasic(text);
}
/// Extract suffix from a simple *.ext pattern
fn extractSuffix(text: []const u8) ?[]const u8 {
// Must start with * and have no other wildcards
if (text.len < 2 or text[0] != '*') return null;
// Check it's not **
if (text.len >= 2 and text[1] == '*') return null;
const rest = text[1..];
// Rest must have no wildcards
if (hasWildcards(rest)) return null;
return rest;
}
/// Parse gitignore content - takes ownership of the content slice
fn parseOwned(allocator: Allocator, content: []const u8) !Self {
const PatternList = std.array_list.AlignedManaged(Pattern, null);
var patterns_list = PatternList.init(allocator);
defer patterns_list.deinit();
var wildcard_list = PatternList.init(allocator);
defer wildcard_list.deinit();
var literal_dirs = StringHashMap.init(allocator);
errdefer literal_dirs.deinit();
var literal_files = StringHashMap.init(allocator);
errdefer literal_files.deinit();
var suffix_map = std.StringHashMap(PatternList).init(allocator);
defer {
var iter = suffix_map.valueIterator();
while (iter.next()) |list| {
list.deinit();
}
suffix_map.deinit();
}
var has_negations = false;
var first_negation_index: u16 = std.math.maxInt(u16);
var index: u16 = 0;
var line_iter = mem.splitScalar(u8, content, '\n');
while (line_iter.next()) |raw_line| {
const line = if (raw_line.len > 0 and raw_line[raw_line.len - 1] == '\r')
raw_line[0 .. raw_line.len - 1]
else
raw_line;
if (parseLine(line, index)) |pattern| {
try patterns_list.append(pattern);
if (pattern.negated) {
has_negations = true;
if (index < first_negation_index) {
first_negation_index = index;
}
}
// Categorize pattern for optimized lookup
if (pattern.is_literal and !pattern.anchored) {
// Simple literal pattern - add to hash map
const match = PatternMatch{
.negated = pattern.negated,
.dir_only = pattern.dir_only,
.index = index,
};
if (pattern.dir_only) {
try literal_dirs.put(pattern.text, match);
} else {
try literal_files.put(pattern.text, match);
}
} else if (pattern.suffix) |suffix| {
// Suffix pattern like *.rs
const result = try suffix_map.getOrPut(suffix);
if (!result.found_existing) {
result.value_ptr.* = PatternList.init(allocator);
}
try result.value_ptr.append(pattern);
} else {
// Wildcard pattern OR anchored literal pattern
try wildcard_list.append(pattern);
}
index += 1;
}
}
// Convert suffix ArrayLists to slices
var suffix_patterns = std.StringHashMap([]Pattern).init(allocator);
errdefer suffix_patterns.deinit();
var suffix_iter = suffix_map.iterator();
while (suffix_iter.next()) |entry| {
const slice = try entry.value_ptr.toOwnedSlice();
try suffix_patterns.put(entry.key_ptr.*, slice);
}
return Self{
.patterns = try patterns_list.toOwnedSlice(),
.wildcard_patterns = try wildcard_list.toOwnedSlice(),
.literal_dirs = literal_dirs,
.literal_files = literal_files,
.suffix_patterns = suffix_patterns,
.has_negations = has_negations,
.first_negation_index = first_negation_index,
.allocator = allocator,
.source = content,
.dir_cache = std.StringHashMap(bool).init(allocator),
};
}
/// Parse gitignore content from a borrowed string (for testing)
pub fn parse(allocator: Allocator, content: []const u8) !Self {
const owned = try allocator.dupe(u8, content);
return try parseOwned(allocator, owned);
}
/// Parse a single line - returns pattern with text as slice into line
fn parseLine(line: []const u8, index: u16) ?Pattern {
var text = line;
if (text.len == 0 or text[0] == '#') {
return null;
}
// Trim trailing unescaped spaces
while (text.len > 0 and text[text.len - 1] == ' ') {
if (text.len >= 2 and text[text.len - 2] == '\\') break;
text = text[0 .. text.len - 1];
}
if (text.len == 0) return null;
var negated = false;
if (text[0] == '!') {
negated = true;
text = text[1..];
if (text.len == 0) return null;
}
var dir_only = false;
if (text[text.len - 1] == '/') {
dir_only = true;
text = text[0 .. text.len - 1];
if (text.len == 0) return null;
}
var anchored = false;
if (text[0] == '/') {
anchored = true;
text = text[1..];
} else {
for (text) |ch| {
if (ch == '/') {
anchored = true;
break;
}
}
}
const is_literal = !hasWildcards(text);
const suffix = if (!anchored) extractSuffix(text) else null;
const has_slash = glob.indexOfCharSIMD(text, '/') != null;
// Pre-compute suffix values for fast matching
var suffix_len: u8 = 0;
var suffix_u32: u32 = 0;
if (suffix) |s| {
suffix_len = @intCast(s.len);
if (s.len <= 4) {
@memcpy(@as([*]u8, @ptrCast(&suffix_u32))[0..s.len], s);
}
}
return Pattern{
.text = text,
.negated = negated,
.dir_only = dir_only,
.anchored = anchored,
.has_double_star = mem.indexOf(u8, text, "**") != null,
.is_literal = is_literal,
.has_slash = has_slash,
.suffix = suffix,
.suffix_len = suffix_len,
.suffix_u32 = suffix_u32,
.index = index,
};
}
/// Check if a path should be ignored - optimized version
pub fn isIgnored(self: *const Self, path: []const u8, is_dir: bool) bool {
// Fast path: skip ./ prefix if present (common case: no prefix)
const normalized_path = if (path.len > 2 and path[0] == '.' and path[1] == '/') path[2..] else path;
const basename = if (glob.lastIndexOfCharSIMD(normalized_path, '/')) |pos|
normalized_path[pos + 1 ..]
else
normalized_path;
// Fast path: if no negations exist, we can use optimized lookups
if (!self.has_negations) {
if (is_dir) {
if (self.literal_dirs.get(basename)) |_| {
return true;
}
}
if (self.literal_files.get(basename)) |match| {
if (!match.dir_only or is_dir) {
return true;
}
}
if (glob.lastIndexOfCharSIMD(basename, '.')) |dot_pos| {
const suffix = basename[dot_pos..];
if (self.suffix_patterns.get(suffix)) |patterns| {
// Any matching suffix pattern means ignored (check dir_only constraint)
for (patterns) |pattern| {
if (!pattern.dir_only or is_dir) return true;
}
}
}
// Check wildcard patterns - these require actual pattern matching
// OPTIMIZATION: Most wildcard patterns in typical .gitignore files
// are basename-only (no /), so we only need to check against basename
for (self.wildcard_patterns) |pattern| {
// For dir_only patterns checking a file: only skip if the pattern
// cannot match a parent directory. matchPatternFast handles the
// "is this file inside an ignored directory" check.
if (pattern.dir_only and !is_dir and !pattern.anchored and !pattern.has_slash) continue;
if (matchPatternFast(&pattern, normalized_path, basename)) {
return true;
}
}
return false;
}
// Slow path: has negations, must process all patterns in order
var ignored = false;
for (self.patterns) |pattern| {
// For dir_only patterns checking a file: only skip if the pattern
// cannot match a parent directory. matchPatternFast handles the
// "is this file inside an ignored directory" check.
if (pattern.dir_only and !is_dir and !pattern.anchored and !pattern.has_slash) continue;
if (matchPatternFast(&pattern, normalized_path, basename)) {
ignored = !pattern.negated;
}
}
return ignored;
}
inline fn matchPatternFast(pattern: *const Pattern, path: []const u8, basename: []const u8) bool {
const text = pattern.text;
// Anchored patterns match against full path only
if (pattern.anchored) {
// For directory patterns, also match paths that are inside the directory
// e.g., pattern "rust/target" should match "rust/target/debug/foo.rs"
if (pattern.dir_only) {
// Check exact match first
if (mem.eql(u8, text, path)) return true;
// Check if path is inside this directory (path starts with "pattern/")
if (path.len > text.len and
mem.startsWith(u8, path, text) and
path[text.len] == '/')
{
return true;
}
return false;
}
return path_matcher.matchGlobSimple(text, path);
}
// Non-anchored patterns without / match against basename only
if (!pattern.has_slash) {
// Fast path for simple suffix patterns (*.o, *.rs)
// Use SIMD-style matching with pre-computed u32
if (pattern.suffix_len > 0) {
if (basename.len < pattern.suffix_len) return false;
const suffix_len = pattern.suffix_len;
return switch (suffix_len) {
1 => basename[basename.len - 1] == @as(u8, @truncate(pattern.suffix_u32)),
2 => blk: {
const tail_ptr = basename.ptr + basename.len - 2;
const tail: u16 = @as(*align(1) const u16, @ptrCast(tail_ptr)).*;
break :blk tail == @as(u16, @truncate(pattern.suffix_u32));
},
3 => blk: {
const tail_ptr = basename.ptr + basename.len - 3;
const tail_u16: u16 = @as(*align(1) const u16, @ptrCast(tail_ptr)).*;
const suffix_u16: u16 = @truncate(pattern.suffix_u32);
break :blk tail_u16 == suffix_u16 and tail_ptr[2] == pattern.suffix.?[2];
},
4 => blk: {
const tail_ptr = basename.ptr + basename.len - 4;
const tail: u32 = @as(*align(1) const u32, @ptrCast(tail_ptr)).*;
break :blk tail == pattern.suffix_u32;
},
else => mem.endsWith(u8, basename, pattern.suffix.?),
};
}
// Literal patterns
if (pattern.is_literal) {
return mem.eql(u8, text, basename);
}
// Complex wildcard patterns (*.o.*, .*, etc)
return glob.fnmatch.fnmatch(text, basename, .{});
}
// Non-anchored patterns with / - match full path
// For directory patterns, also match paths inside the directory
if (pattern.dir_only) {
if (path_matcher.matchGlobSimple(text, path)) return true;
// Check if any path component matches and this is a child path
// e.g., pattern "target" with dir_only should match "foo/target/bar.rs"
var start: usize = 0;
while (start < path.len) {
const end = mem.indexOfPos(u8, path, start, "/") orelse path.len;
const component_path = path[0..end];
if (path_matcher.matchGlobSimple(text, component_path)) {
// This path component matches, so any path starting with it is inside
return true;
}
if (end >= path.len) break;
start = end + 1;
}
return false;
}
return path_matcher.matchGlobSimple(text, path);
}
/// Check if a directory should be skipped entirely (not traversed)
/// This is called for every directory during traversal, so it must be fast.
/// We only skip directories that are DEFINITELY ignored with no possibility
/// of negation patterns affecting their children.
pub fn shouldSkipDirectory(self: *Self, dir_path: []const u8) bool {
const normalized_path = if (dir_path.len > 2 and dir_path[0] == '.' and dir_path[1] == '/') dir_path[2..] else dir_path;
// Check cache first - this is critical for performance
if (self.dir_cache.get(normalized_path)) |cached| {
return cached;
}
const basename = if (glob.lastIndexOfCharSIMD(normalized_path, '/')) |pos|
normalized_path[pos + 1 ..]
else
normalized_path;
// FAST PATH: Check literal directory patterns (O(1) lookup)
// Common patterns like "node_modules/", "target/", ".git/"
if (self.literal_dirs.get(basename)) |match| {
if (!match.negated) {
// Check if any negation could affect this directory or its children
if (!self.has_negations) {
// No negations at all - safe to skip
self.cacheResult(normalized_path, true);
return true;
}
// Has negations - check if any could affect this directory
// A negation can affect this directory if:
// 1. It directly re-includes this directory path
// 2. It re-includes something under this directory (starts with our path + /)
var dominated_by_negation = false;
for (self.patterns) |pattern| {
if (!pattern.negated) continue;
// Check if negation could affect this dir or its children
if (pattern.has_double_star) {
// ** negation could match anywhere - must be conservative
dominated_by_negation = true;
break;
}
// Check if negation pattern matches or is under our path
if (mem.startsWith(u8, pattern.text, normalized_path)) {
dominated_by_negation = true;
break;
}
if (mem.startsWith(u8, pattern.text, basename)) {
dominated_by_negation = true;
break;
}
}
if (!dominated_by_negation) {
self.cacheResult(normalized_path, true);
return true;
}
}
}
// Check anchored directory patterns (e.g., "rust/target/", "src/build/")
// These are stored in wildcard_patterns because they contain /
for (self.wildcard_patterns) |pattern| {
// Only check directory patterns that are anchored and literal
if (!pattern.dir_only) continue;
if (!pattern.anchored) continue;
if (!pattern.is_literal) continue;
if (pattern.negated) continue;
// For anchored literal directory patterns, check exact match
if (mem.eql(u8, pattern.text, normalized_path)) {
// Check if any negation could affect this directory or its children
if (!self.has_negations) {
self.cacheResult(normalized_path, true);
return true;
}
// Check for dominating negations
var dominated_by_negation = false;
for (self.patterns) |neg_pattern| {
if (!neg_pattern.negated) continue;
if (neg_pattern.has_double_star) {
dominated_by_negation = true;
break;
}
// Check if negation pattern could affect this dir or its children
if (mem.startsWith(u8, neg_pattern.text, normalized_path)) {
dominated_by_negation = true;
break;
}
}
if (!dominated_by_negation) {
self.cacheResult(normalized_path, true);
return true;
}
}
}
// For non-literal patterns or when negations might interfere,
// we need to be conservative and NOT skip
self.cacheResult(normalized_path, false);
return false;
}
/// Cache a directory skip result - duplicates the key since it may be from a stack buffer
fn cacheResult(self: *Self, path: []const u8, should_skip: bool) void {
// Duplicate the key since it may point to a temporary stack buffer
const key_copy = self.allocator.dupe(u8, path) catch return;
self.dir_cache.put(key_copy, should_skip) catch {
self.allocator.free(key_copy);
};
}
pub fn deinit(self: *Self) void {
// Free suffix pattern slices
var suffix_iter = self.suffix_patterns.valueIterator();
while (suffix_iter.next()) |slice| {
self.allocator.free(slice.*);
}
self.suffix_patterns.deinit();
// Free duplicated keys in dir_cache
var cache_iter = self.dir_cache.keyIterator();
while (cache_iter.next()) |key| {
self.allocator.free(key.*);
}
self.dir_cache.deinit();
self.literal_dirs.deinit();
self.literal_files.deinit();
self.allocator.free(self.patterns);
self.allocator.free(self.wildcard_patterns);
self.allocator.free(self.source);
}
};
// Tests
test "parse empty content" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator, "");
defer gi.deinit();
try std.testing.expectEqual(@as(usize, 0), gi.patterns.len);
}
test "parse comments and blank lines" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\# This is a comment
\\
\\# Another comment
\\
);
defer gi.deinit();
try std.testing.expectEqual(@as(usize, 0), gi.patterns.len);
}
test "parse simple patterns" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\*.log
\\build/
\\!important.log
);
defer gi.deinit();
try std.testing.expectEqual(@as(usize, 3), gi.patterns.len);
try std.testing.expectEqualStrings("*.log", gi.patterns[0].text);
try std.testing.expect(!gi.patterns[0].negated);
try std.testing.expectEqualStrings("build", gi.patterns[1].text);
try std.testing.expect(gi.patterns[1].dir_only);
try std.testing.expectEqualStrings("important.log", gi.patterns[2].text);
try std.testing.expect(gi.patterns[2].negated);
}
test "parse anchored patterns" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\/root.txt
\\src/temp
);
defer gi.deinit();
try std.testing.expectEqual(@as(usize, 2), gi.patterns.len);
try std.testing.expectEqualStrings("root.txt", gi.patterns[0].text);
try std.testing.expect(gi.patterns[0].anchored);
try std.testing.expectEqualStrings("src/temp", gi.patterns[1].text);
try std.testing.expect(gi.patterns[1].anchored);
}
test "isIgnored basic patterns" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\*.log
\\build/
);
defer gi.deinit();
try std.testing.expect(gi.isIgnored("test.log", false));
try std.testing.expect(gi.isIgnored("src/debug.log", false));
try std.testing.expect(!gi.isIgnored("test.txt", false));
try std.testing.expect(gi.isIgnored("build", true));
try std.testing.expect(!gi.isIgnored("build", false));
}
test "isIgnored with negation" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\*.log
\\!important.log
);
defer gi.deinit();
try std.testing.expect(gi.isIgnored("test.log", false));
try std.testing.expect(!gi.isIgnored("important.log", false));
}
test "isIgnored double star" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\**/logs
\\src/**/*.o
);
defer gi.deinit();
try std.testing.expect(gi.isIgnored("logs", true));
try std.testing.expect(gi.isIgnored("app/logs", true));
try std.testing.expect(gi.isIgnored("src/main.o", false));
try std.testing.expect(gi.isIgnored("src/lib/util.o", false));
try std.testing.expect(!gi.isIgnored("other/main.o", false));
}
test "shouldSkipDirectory" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\node_modules/
\\build/
\\!build/keep/
);
defer gi.deinit();
try std.testing.expect(gi.shouldSkipDirectory("node_modules"));
try std.testing.expect(!gi.shouldSkipDirectory("build"));
}
test "literal pattern optimization" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\target/
\\node_modules/
\\.git/
);
defer gi.deinit();
// These should use O(1) hash lookup
try std.testing.expect(gi.isIgnored("target", true));
try std.testing.expect(gi.isIgnored("node_modules", true));
try std.testing.expect(gi.isIgnored(".git", true));
try std.testing.expect(gi.isIgnored("foo/target", true));
try std.testing.expect(!gi.isIgnored("target", false)); // dir_only
}
test "suffix pattern optimization" {
const allocator = std.testing.allocator;
var gi = try GitIgnore.parse(allocator,
\\*.rs
\\*.log
);
defer gi.deinit();
try std.testing.expect(gi.isIgnored("main.rs", false));
try std.testing.expect(gi.isIgnored("src/lib.rs", false));
try std.testing.expect(gi.isIgnored("debug.log", false));
try std.testing.expect(!gi.isIgnored("main.txt", false));
}