// zig wrapper for pcre2 regex library
// license: this code is released in the public domain. Do what thou wilt.
// 2023-03-04 17:30:51Z

const std = @import("std");
const debug = std.debug;

const c = @cImport({
    @cDefine("PCRE2_CODE_UNIT_WIDTH", "8");
    @cInclude("pcre2.h");
});

/// A compiled PCRE2 pattern (8-bit code unit API).
/// Obtain one with `init` and release it with `deinit`.
pub const Regex = struct {
    r: *c.pcre2_code_8,

    /// Compiles `pattern` using the flags selected in `options`.
    /// Returns `error.RegexCompile` when `pcre2_compile` produces no compiled code.
    pub fn init(pattern: []const u8, options: CompileOptions) !Regex {
        // pcre2_compile writes its error code and pattern offset into these
        // out-parameters; they are collected here but not passed on to the caller.
        var errnum: c_int = 0;
        var erroff: usize = 0;
        const nocontext = null;
        // https://PCRE2Project.github.io/pcre2/doc/html/pcre2_compile.html
        const compiled = c.pcre2_compile_8(
            pattern.ptr,
            pattern.len,
            options.c_mask(),
            &errnum,
            &erroff,
            nocontext,
        ) orelse return error.RegexCompile;
        return Regex{ .r = compiled };
    }

    /// Frees the compiled pattern.
    pub fn deinit(re: Regex) void {
        c.pcre2_code_free_8(re.r);
    }

    /// Matches `subject` against the pattern, starting at offset 0, and stores the
    /// resulting offsets in `grp`.
    ///
    /// Returns the positive value reported by `pcre2_match`. Non-positive results
    /// are mapped to errors:
    ///   * 0                    -> `error.OutOfMemory`
    ///   * PCRE2_ERROR_NOMATCH  -> `error.RegexNoMatch`
    ///   * PCRE2_ERROR_PARTIAL  -> `error.RegexPartialMatch`
    ///   * anything else        -> `error.RegexUnexpectedError`
    pub fn match(re: Regex, grp: Groups, subject: []const u8, options: MatchOptions) !usize {
        const nocontext = null;
        const startoffset = 0;
        // https://PCRE2Project.github.io/pcre2/doc/html/pcre2_match.html
        const rc = c.pcre2_match_8(
            re.r,
            subject.ptr,
            subject.len,
            startoffset,
            options.c_mask(),
            grp.g,
            nocontext,
        );
        if (rc > 0) {
            // rc is strictly positive here, so the checked conversion cannot fail.
            return std.math.cast(usize, rc) orelse unreachable;
        }
        return switch (rc) {
            // NOTE(review): the PCRE2 documentation describes a 0 return as "the
            // offset vector in the match data was too small" - confirm that
            // OutOfMemory is the intended name for that condition.
            0 => error.OutOfMemory,
            c.PCRE2_ERROR_NOMATCH => error.RegexNoMatch,
            c.PCRE2_ERROR_PARTIAL => error.RegexPartialMatch,
            else => error.RegexUnexpectedError,
        };
    }
};

/// Match data (the offset vector) that `Regex.match` fills in.
/// Obtain one with `init` and release it with `deinit`.
pub const Groups = struct {
    g: *c.pcre2_match_data_8,

    /// Creates match data sized from the compiled pattern `re`.
    /// Returns `error.OutOfMemory` when PCRE2 returns no match data block.
    pub fn init(re: Regex) !Groups {
        const data = c.pcre2_match_data_create_from_pattern_8(re.r, null) orelse
            return error.OutOfMemory;
        return Groups{ .g = data };
    }

    /// Frees the match data.
    pub fn deinit(grp: Groups) void {
        c.pcre2_match_data_free_8(grp.g);
    }

    /// Number of offset pairs held by the match data, as reported by
    /// `pcre2_get_ovector_count`.
    pub fn count(grp: Groups) usize {
        return c.pcre2_get_ovector_count_8(grp.g);
    }

    /// Returns the part of `subject` covered by offset pair `n`; pair 0 is the
    /// whole match. `subject` must be the slice that was passed to `Regex.match`,
    /// and `n` must be less than `count()` (asserted).
    ///
    /// A group that did not take part in the match yields an empty slice.
    pub fn nth(grp: Groups, n: usize, subject: []const u8) []const u8 {
        debug.assert(n < grp.count());
        const vec = c.pcre2_get_ovector_pointer_8(grp.g);
        const start = vec[2 * n];
        const end = vec[2 * n + 1];
        // PCRE2 documents unset groups as having both offsets equal to
        // PCRE2_UNSET, i.e. ~(PCRE2_SIZE)0. Slicing with that value would be out
        // of bounds, so report such groups as empty instead.
        const unset = std.math.maxInt(usize);
        if (start == unset or end == unset) return subject[0..0];
        return subject[start..end];
    }
};

/// Flags for `Regex.init`. Each field enables the PCRE2 compile option that
/// `c_mask` pairs it with; all of them default to off.
pub const CompileOptions = struct {
    Anchored: bool = false,
    AllowEmptyClass: bool = false,
    AltBsux: bool = false,
    AltCircumflex: bool = false,
    AltVerbnames: bool = false,
    AutoCallout: bool = false,
    Caseless: bool = false,
    DollarEndonly: bool = false,
    Dotall: bool = false,
    Dupnames: bool = false,
    Endanchored: bool = false,
    Extended: bool = false,
    Firstline: bool = false,
    Literal: bool = false,
    MatchInvalidUtf: bool = false,
    MatchUnsetBackref: bool = false,
    Multiline: bool = false,
    NeverBackslashC: bool = false,
    NeverUcp: bool = false,
    NeverUtf: bool = false,
    NoAutoCapture: bool = false,
    NoAutoPossess: bool = false,
    NoDotstarAnchor: bool = false,
    NoStartOptimize: bool = false,
    NoUtfCheck: bool = false,
    Ucp: bool = false,
    Ungreedy: bool = false,
    UseOffsetLimit: bool = false,
    Utf: bool = false,

    /// Builds the option bit mask handed to `pcre2_compile` by OR-ing together
    /// the PCRE2 constant of every field that is set.
    pub fn c_mask(o: CompileOptions) u32 {
        var m: u32 = 0;
        if (o.Anchored) m |= c.PCRE2_ANCHORED;
        if (o.AllowEmptyClass) m |= c.PCRE2_ALLOW_EMPTY_CLASS;
        if (o.AltBsux) m |= c.PCRE2_ALT_BSUX;
        if (o.AltCircumflex) m |= c.PCRE2_ALT_CIRCUMFLEX;
        if (o.AltVerbnames) m |= c.PCRE2_ALT_VERBNAMES;
        if (o.AutoCallout) m |= c.PCRE2_AUTO_CALLOUT;
        if (o.Caseless) m |= c.PCRE2_CASELESS;
        if (o.DollarEndonly) m |= c.PCRE2_DOLLAR_ENDONLY;
        if (o.Dotall) m |= c.PCRE2_DOTALL;
        if (o.Dupnames) m |= c.PCRE2_DUPNAMES;
        if (o.Endanchored) m |= c.PCRE2_ENDANCHORED;
        if (o.Extended) m |= c.PCRE2_EXTENDED;
        if (o.Firstline) m |= c.PCRE2_FIRSTLINE;
        if (o.Literal) m |= c.PCRE2_LITERAL;
        if (o.MatchInvalidUtf) m |= c.PCRE2_MATCH_INVALID_UTF;
        if (o.MatchUnsetBackref) m |= c.PCRE2_MATCH_UNSET_BACKREF;
        if (o.Multiline) m |= c.PCRE2_MULTILINE;
        if (o.NeverBackslashC) m |= c.PCRE2_NEVER_BACKSLASH_C;
        if (o.NeverUcp) m |= c.PCRE2_NEVER_UCP;
        if (o.NeverUtf) m |= c.PCRE2_NEVER_UTF;
        if (o.NoAutoCapture) m |= c.PCRE2_NO_AUTO_CAPTURE;
        if (o.NoAutoPossess) m |= c.PCRE2_NO_AUTO_POSSESS;
        if (o.NoDotstarAnchor) m |= c.PCRE2_NO_DOTSTAR_ANCHOR;
        if (o.NoStartOptimize) m |= c.PCRE2_NO_START_OPTIMIZE;
        if (o.NoUtfCheck) m |= c.PCRE2_NO_UTF_CHECK;
        if (o.Ucp) m |= c.PCRE2_UCP;
        if (o.Ungreedy) m |= c.PCRE2_UNGREEDY;
        if (o.UseOffsetLimit) m |= c.PCRE2_USE_OFFSET_LIMIT;
        if (o.Utf) m |= c.PCRE2_UTF;
        return m;
    }
};

/// Flags for `Regex.match`. Each field enables the PCRE2 match option that
/// `c_mask` pairs it with; all of them default to off.
pub const MatchOptions = struct {
    Anchored: bool = false,
    CopyMatchedSubject: bool = false,
    Endanchored: bool = false,
    Notbol: bool = false,
    Noteol: bool = false,
    Notempty: bool = false,
    NotemptyAtstart: bool = false,
    NoJit: bool = false,
    NoUtfCheck: bool = false,
    PartialHard: bool = false,
    PartialSoft: bool = false,

    /// Builds the option bit mask handed to `pcre2_match` by OR-ing together
    /// the PCRE2 constant of every field that is set.
    pub fn c_mask(o: MatchOptions) u32 {
        var m: u32 = 0;
        if (o.Anchored) m |= c.PCRE2_ANCHORED;
        if (o.CopyMatchedSubject) m |= c.PCRE2_COPY_MATCHED_SUBJECT;
        if (o.Endanchored) m |= c.PCRE2_ENDANCHORED;
        if (o.Notbol) m |= c.PCRE2_NOTBOL;
        if (o.Noteol) m |= c.PCRE2_NOTEOL;
        if (o.Notempty) m |= c.PCRE2_NOTEMPTY;
        if (o.NotemptyAtstart) m |= c.PCRE2_NOTEMPTY_ATSTART;
        if (o.NoJit) m |= c.PCRE2_NO_JIT;
        if (o.NoUtfCheck) m |= c.PCRE2_NO_UTF_CHECK;
        if (o.PartialHard) m |= c.PCRE2_PARTIAL_HARD;
        if (o.PartialSoft) m |= c.PCRE2_PARTIAL_SOFT;
        return m;
    }
};

test "basic test" {
    const tt = std.testing;
    const nooptions = .{};
    const re = try Regex.init("abc*", nooptions);
    defer re.deinit();
    var grp = try Groups.init(re);
    defer grp.deinit();
    const subject = "abcccc";
    // Unwrap the result so a failed match surfaces as its own error.
    const nmatch = try re.match(grp, subject, nooptions);
    try tt.expectEqual(@as(usize, 1), nmatch);
    const ncap = grp.count();
    try tt.expectEqual(ncap, nmatch);
    try tt.expectEqualStrings(subject, grp.nth(0, subject));
}

test "group test" {
    const tt = std.testing;
    const nooptions = .{};
    const subject =
        \\99.99.99.99 - - [23/Feb/2023:09:46:18 +0000] "POST /u/nandalism/inbox HTTP/1.1" 200 0 "-" "http.rb/4.4.1 (Mastodon/3.3.3; +https://pawoo.net/)"
    ;
    const re = try Regex.init(
        \\(\S+) - (\S+) \[([^]]+)\] "([^"]+)" (\S+) (\S+) "([^"]+)" "([^"]+)"
    , nooptions);
    defer re.deinit();
    var grp = try Groups.init(re);
    defer grp.deinit();
    const nmatch = re.match(grp, subject, nooptions);
    try tt.expectEqual(nmatch,9);
    const ncap = grp.count();
    try tt.expectEqual(nmatch, ncap);
    const exstr = [_][]const u8{
        subject,
        "99.99.99.99",
        "-",
        "23/Feb/2023:09:46:18 +0000",
        "POST /u/nandalism/inbox HTTP/1.1",
        "200",
        "0",
        "-",
        "http.rb/4.4.1 (Mastodon/3.3.3; +https://pawoo.net/)",
    };
    var i: usize = 0;
    while(i