boost.regex を C#から呼び出す
C# で正規表現を利用する場合、通常はSystem.Text.RegularExpressions.Regex
クラスを使用しますが、.NET 5.0の時点で入力にstring
しか受け付けないので使い勝手が悪いです。
この記事では、C++の正規表現エンジンである boost.regex を使用してみます。
やることは単純にP/InvokeでPOSIX APIを呼び出すだけです。
前提:
- Windows環境
- .NET Core 3.1
<LangVersion>9.0</LangVersion>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
- boost.regexのバージョンは1.72
boost.regex
C/C++のライブラリをC++/CLIでラップしてもいいですが、
boost.regexにはPOSIX 互換インターフェースがあるので、これがあればC#から正規表現を使用することができます。
インターフェースには、ASCII版の-A系関数と、UNICODE(UTF-16)版の-W系関数があります。
/* Narrow-character ("A") entry points: compile a pattern, format an error
 * message, execute a search, and free a compiled expression. */
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regcompA(regex_tA*, const char*, int);
BOOST_REGEX_DECL regsize_t BOOST_REGEX_CCALL regerrorA(int, const regex_tA*, char*, regsize_t);
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regexecA(const regex_tA*, const char*, regsize_t, regmatch_t*, int);
BOOST_REGEX_DECL void BOOST_REGEX_CCALL regfreeA(regex_tA*);
/* Wide-character ("W") counterparts; only declared when BOOST_NO_WREGEX is
 * not defined. */
#ifndef BOOST_NO_WREGEX
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regcompW(regex_tW*, const wchar_t*, int);
BOOST_REGEX_DECL regsize_t BOOST_REGEX_CCALL regerrorW(int, const regex_tW*, wchar_t*, regsize_t);
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regexecW(const regex_tW*, const wchar_t*, regsize_t, regmatch_t*, int);
BOOST_REGEX_DECL void BOOST_REGEX_CCALL regfreeW(regex_tW*);
#endif
ネイティブのライブラリを使用する場合、自分でライブラリをビルドするのは敷居が高いのですがboost.regexは幸いなことにビルド済みパッケージがNugetで配布されています。
ただしC++プロジェクト向けなので、C#のプロジェクトで参照してもDLLはプロジェクトには配置されません。
ソリューションに空のC++プロジェクトを追加すれば、packages
の中にDLLが展開されます。
(もしくは、ローカルのNugetキャッシュや直接.nupkgをダウンロードするなど)
...
packages\boost_regex-vc142.1.72.0.0\lib\native のディレクトリ
2021/02/20 13:37 <DIR> .
2021/02/20 13:37 <DIR> ..
2020/04/18 21:45 1,665,024 boost_regex-vc142-mt-gd-x32-1_72.dll
2020/04/18 21:45 1,190,262 boost_regex-vc142-mt-gd-x32-1_72.lib
2020/04/18 21:45 2,061,824 boost_regex-vc142-mt-gd-x64-1_72.dll
2020/04/18 21:45 1,202,468 boost_regex-vc142-mt-gd-x64-1_72.lib
2020/04/18 21:45 683,008 boost_regex-vc142-mt-x32-1_72.dll
2020/04/18 21:45 1,186,870 boost_regex-vc142-mt-x32-1_72.lib
2020/04/18 21:45 793,088 boost_regex-vc142-mt-x64-1_72.dll
2020/04/18 21:45 1,199,086 boost_regex-vc142-mt-x64-1_72.lib
...
gdはデバッグビルドを表すタグのはずなので、gdの付かない boost_regex-vc142-mt-x32-1_72.dll
または boost_regex-vc142-mt-x64-1_72.dll
を使います。
C#から呼び出す
UNICODE版を使用してもRegex
クラスと差がないので、ASCII版の関数を使います。
boost.regexの中身は見ていませんが、おそらくアクティブなコードページで動作すると思われます。ここではASCII文字の範囲のみを使います。
マルチバイト文字セット(要はShift-JIS)での動作は見ません。
P/Invokeの関数を用意しましょう。
引数はポインターじゃなくてref/readonly ref
にしました。
/// <summary>
/// P/Invoke declarations for the POSIX-compatible C API exported by the Boost.Regex DLL.
/// </summary>
/// <remarks>
/// Buffers are passed as a reference to their first element (<c>in</c>/<c>ref</c>) instead of raw
/// pointers. Pattern and subject buffers must be NUL-terminated because the native side reads
/// them as C strings.
/// </remarks>
internal static class NativeMethods
{
    // File name (without extension) of the native library; this is the x64 release build.
    private const string dllName = "boost_regex-vc142-mt-x64-1_72";

    // The native prototypes are declared with BOOST_REGEX_CCALL, which Boost maps to __cdecl for
    // MSVC builds. The convention is irrelevant on x64 but must be spelled out for the x32 DLL,
    // where the DllImport default would otherwise be stdcall.
    private const CallingConvention callConv = CallingConvention.Cdecl;

    // ---- Narrow-character ("A") API ----

    /// <summary>Compiles the NUL-terminated <paramref name="pattern"/> into <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern reg_error_t regcompA(ref regex_tA preg, in byte pattern, reg_comp_flags cflags);

    /// <summary>Writes the message for <paramref name="errcode"/> into a caller-supplied byte buffer.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, ref byte errbuf, nuint errbuf_size);

    /// <summary>Writes the message for <paramref name="errcode"/> into a <see cref="StringBuilder"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, [Out] StringBuilder errbuf, nuint errbuf_size);

    /// <summary>Searches the NUL-terminated buffer starting at <paramref name="str"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern reg_error_t regexecA(in regex_tA preg, in byte str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

    /// <summary>Releases the native resources held by <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern void regfreeA(ref regex_tA preg);

    // ---- Wide-character ("W", UTF-16) API ----

    /// <summary>Compiles the NUL-terminated <paramref name="pattern"/> into <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern reg_error_t regcompW(ref regex_tW preg, in char pattern, reg_comp_flags cflags);

    /// <summary>Writes the message for <paramref name="errcode"/> into a caller-supplied char buffer.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, ref char errbuf, nuint errbuf_size);

    /// <summary>Writes the message for <paramref name="errcode"/> into a <see cref="StringBuilder"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, [Out] StringBuilder errbuf, nuint errbuf_size);

    /// <summary>Searches the NUL-terminated buffer starting at <paramref name="str"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern reg_error_t regexecW(in regex_tW preg, in char str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

    /// <summary>Releases the native resources held by <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern void regfreeW(ref regex_tW preg);
}
ライブラリー名は固定の文字列をそのまま使用していますが、ビルド時に切り替える仕組みにするか、NativeLibrary.SetDllImportResolver
で実行時にいい感じに解決するのがいいでしょう。
あとはCと同じ感覚で呼び出すだけです。
ただし、入力はNUL文字終端が必要です。
manpageにあるサンプルを移植したコードです。
using System;
using System.Runtime.InteropServices;
using System.Text;
using static BoostRegex.NativeMethods;
namespace BoostRegex
{
/// <summary>
/// Port of the regex(3) man-page sample: prints every match of <c>John.*o</c> found in a small
/// text, using the narrow-character Boost.Regex POSIX API.
/// </summary>
static class Program
{
    static void Main(string[] args)
    {
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        regex_tA preg = default;
        // Both buffers carry an explicit trailing '\0' because the native API reads C strings.
        var str = Encoding.Default.GetBytes("1) John Driverhacker;\n2) John Doe;\n3) John Foo;\n\0");
        var pattern = Encoding.Default.GetBytes("John.*o\0");
        var matches = new regmatch_t[1];
        reg_error_t rc = regcompA(ref preg, in pattern[0], reg_comp_flags.REG_NEWLINE);
        if (rc != 0)
        {
            Console.WriteLine($"regcompA() failed. ({rc})");
            return;
        }

        // From here on the compiled expression must be released on every exit path.
        try
        {
            nint pos = 0;
            for (var i = 0; ; i++)
            {
                // Each search starts where the previous match ended.
                rc = regexecA(in preg, in str[pos], (uint)matches.Length, ref matches[0], 0);
                if (rc != 0)
                {
                    // Also reached when no further match exists, which is how the loop normally ends.
                    Console.WriteLine($"regexecA() failed.({rc})");
                    break;
                }

                var match = matches[0];
                // Validate the slot before anything derived from it is printed.
                if (match.rm_so == -1 || match.rm_eo == -1)
                {
                    break;
                }

                // Offsets are relative to the search start, so add the base position.
                nint off = match.rm_so + pos;
                nint len = match.rm_eo - match.rm_so;
                Console.WriteLine($"#{i}:");
                Console.WriteLine($"offset = {off}; length = {len}");
                Console.WriteLine($"substring = \"{GetAnsiString(str.AsSpan((int)(pos + match.rm_so), (int)len))}\"");

                if (match.rm_eo > 0)
                {
                    pos += match.rm_eo;
                }
                else if (str[pos] == 0)
                {
                    // Empty match at the terminator: nothing left to scan.
                    break;
                }
                else
                {
                    // Empty match at the current position: step one byte so the loop cannot spin forever.
                    pos++;
                }
            }
        }
        finally
        {
            regfreeA(ref preg);
        }
    }

    /// <summary>Decodes <paramref name="str"/> with <see cref="Encoding.Default"/>.</summary>
    static string GetAnsiString(ReadOnlySpan<byte> str) => Encoding.Default.GetString(str);
}
}
regex_t
はラッパークラスを用意したほうがいいでしょう。
全文は折りたたんでます。
using System;
using System.Runtime.InteropServices;
using System.Text;
using static BoostRegex.NativeMethods;
namespace BoostRegex
{
/// <summary>
/// Port of the regex(3) man-page sample: prints every match of <c>John.*o</c> found in a small
/// text, using the narrow-character Boost.Regex POSIX API.
/// </summary>
static class Program
{
    static void Main(string[] args)
    {
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
        regex_tA preg = default;
        // Both buffers carry an explicit trailing '\0' because the native API reads C strings.
        var str = Encoding.Default.GetBytes("1) John Driverhacker;\n2) John Doe;\n3) John Foo;\n\0");
        var pattern = Encoding.Default.GetBytes("John.*o\0");
        reg_error_t rc;
        var matches = new regmatch_t[1];
        if (0 != (rc = regcompA(ref preg, in pattern[0], reg_comp_flags.REG_NEWLINE)))
        {
            Console.WriteLine($"regcompA() failed. ({rc})");
            return;
        }

        // From here on the compiled expression must be released on every exit path.
        try
        {
            nint pos = 0;
            for (var i = 0; ; i++)
            {
                // Each search starts where the previous match ended.
                if (0 != (rc = regexecA(in preg, in str[pos], (uint)matches.Length, ref matches[0], 0)))
                {
                    // Also reached when no further match exists, which is how the loop normally ends.
                    Console.WriteLine($"regexecA() failed.({rc})");
                    break;
                }

                var match = matches[0];
                // Validate the slot before anything derived from it is printed.
                if (match.rm_so == -1 || match.rm_eo == -1)
                {
                    break;
                }

                // Offsets are relative to the search start, so add the base position.
                nint off = match.rm_so + pos;
                nint len = match.rm_eo - match.rm_so;
                Console.WriteLine($"#{i}:");
                Console.WriteLine($"offset = {off}; length = {len}");
                Console.WriteLine($"substring = \"{GetAnsiString(str.AsSpan((int)(pos + match.rm_so), (int)len))}\"");

                if (match.rm_eo > 0)
                {
                    pos += match.rm_eo;
                }
                else if (str[pos] == 0)
                {
                    // Empty match at the terminator: nothing left to scan.
                    break;
                }
                else
                {
                    // Empty match at the current position: step one byte so the loop cannot spin forever.
                    pos++;
                }
            }
        }
        finally
        {
            regfreeA(ref preg);
        }
    }

    /// <summary>Decodes <paramref name="str"/> with <see cref="Encoding.Default"/>.</summary>
    static string GetAnsiString(ReadOnlySpan<byte> str) => Encoding.Default.GetString(str);
}
#pragma warning disable IDE1006 // 命名スタイル
/// <summary>
/// Match and format option bits; stored in the private <c>eflags</c> field of the managed
/// <c>regex_tA</c>/<c>regex_tW</c> structures.
/// </summary>
/// <remarks>
/// Bits 0-23 are match options, bits 24-29 are format options. Several names share a value
/// (<c>match_not_initial_null</c>/<c>match_stop</c>, <c>match_max</c>/<c>match_unused3</c>, and the
/// zero-valued defaults).
/// </remarks>
[Flags]
public enum match_flag_type
{
    match_default = 0,
    match_not_bol = 1 << 0,             // first is not start of line
    match_not_eol = 1 << 1,             // last is not end of line
    match_not_bob = 1 << 2,             // first is not start of buffer
    match_not_eob = 1 << 3,             // last is not end of buffer
    match_not_bow = 1 << 4,             // first is not start of word
    match_not_eow = 1 << 5,             // last is not end of word
    match_not_dot_newline = 1 << 6,     // \n is not matched by '.'
    match_not_dot_null = 1 << 7,        // '\0' is not matched by '.'
    match_prev_avail = 1 << 8,          // *--first is a valid expression
    match_init = 1 << 9,                // internal use
    match_any = 1 << 10,                // don't care what we match
    match_not_null = 1 << 11,           // string can't be null
    match_continuous = 1 << 12,         // each grep match must continue uninterrupted from the previous one
    match_partial = 1 << 13,            // find partial matches
    match_stop = 1 << 14,               // stop after first match (grep), V3 only
    match_not_initial_null = match_stop, // don't match initial null, V4 only
    match_all = 1 << 15,                // must find the whole of input even if match_any is set
    match_perl = 1 << 16,               // use Perl matching rules
    match_posix = 1 << 17,              // use POSIX matching rules
    match_nosubs = 1 << 18,             // don't trap marked subs
    match_extra = 1 << 19,              // include full capture information for repeated captures
    match_single_line = 1 << 20,        // treat text as a single line and ignore any \n's when matching ^ and $
    match_unused1 = 1 << 21,            // unused
    match_unused2 = 1 << 22,            // unused
    match_unused3 = 1 << 23,            // unused
    match_max = match_unused3,
    format_perl = 0,                    // Perl style replacement
    format_default = 0,                 // ditto
    format_sed = 1 << 24,               // sed style replacement
    format_all = 1 << 25,               // enable all extensions to syntax
    format_no_copy = 1 << 26,           // don't copy non-matching segments
    format_first_only = 1 << 27,        // only replace first occurrence
    format_is_if = 1 << 28,             // internal use only
    format_literal = 1 << 29,           // treat string as a literal

    // Every bit declared above except match_any.
    match_not_any =
        match_not_bol | match_not_eol | match_not_bob | match_not_eob
        | match_not_bow | match_not_eow | match_not_dot_newline | match_not_dot_null
        | match_prev_avail | match_init | match_not_null | match_continuous
        | match_partial | match_stop | match_all | match_perl
        | match_posix | match_nosubs | match_extra | match_single_line
        | match_unused1 | match_unused2 | match_unused3
        | format_sed | format_all | format_no_copy | format_first_only
        | format_is_if | format_literal
}
/// <summary>
/// Managed counterpart of the native <c>regex_tA</c> structure that is passed by reference to
/// <c>regcompA</c>, <c>regexecA</c>, <c>regerrorA</c> and <c>regfreeA</c>.
/// </summary>
/// <remarks>
/// The layout is sequential, so the order and types of the fields below must not be changed.
/// The private fields are never touched by managed code; they only reserve space for the
/// native side.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
internal unsafe struct regex_tA
{
private uint re_magic;
public nuint re_nsub; /* number of parenthesized subexpressions */
public byte* re_endp; /* end pointer for REG_PEND */
private IntPtr guts; /* none of your business :-) */
private match_flag_type eflags; /* none of your business :-) */
}
/// <summary>
/// Managed counterpart of the native <c>regex_tW</c> structure that is passed by reference to
/// <c>regcompW</c>, <c>regexecW</c>, <c>regerrorW</c> and <c>regfreeW</c>.
/// </summary>
/// <remarks>
/// Same field sequence as <c>regex_tA</c>, except that <c>re_endp</c> points to UTF-16 code
/// units (<c>char*</c>) instead of bytes. The layout is sequential, so the order and types of
/// the fields below must not be changed.
/// </remarks>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
internal unsafe struct regex_tW
{
private uint re_magic;
public nuint re_nsub; /* number of parenthesized subexpressions */
public char* re_endp; /* end pointer for REG_PEND */
private IntPtr guts; /* none of your business :-) */
private match_flag_type eflags; /* none of your business :-) */
}
/// <summary>
/// One match slot passed by reference to <c>regexecA</c>/<c>regexecW</c> as <c>pmatch</c>.
/// </summary>
/// <remarks>
/// The fields are read-only for managed code. The sample program treats -1 in either field as
/// "no match" and adds its own base position to the offsets, i.e. they are relative to the
/// start of the buffer handed to the search call.
/// </remarks>
internal readonly struct regmatch_t
{
public readonly nint rm_so; /* start of match */
public readonly nint rm_eo; /* end of match */
}
/// <summary>
/// Compile-time options passed as <c>cflags</c> to <c>regcompA</c>/<c>regcompW</c>.
/// </summary>
/// <remarks>
/// The trailing comments give the octal spelling of each value. The last four members are plain
/// numbers rather than independent bits and overlap with the flag bits above them.
/// </remarks>
[Flags]
internal enum reg_comp_flags
{
    REG_BASIC = 0,
    REG_EXTENDED = 1 << 0,          // 0001
    REG_ICASE = 1 << 1,             // 0002
    REG_NOSUB = 1 << 2,             // 0004
    REG_NEWLINE = 1 << 3,           // 0010
    REG_NOSPEC = 1 << 4,            // 0020
    REG_PEND = 1 << 5,              // 0040
    REG_DUMP = 1 << 7,              // 0200
    REG_NOCOLLATE = 1 << 8,         // 0400
    REG_ESCAPE_IN_LISTS = 1 << 9,   // 01000
    REG_NEWLINE_ALT = 1 << 10,      // 02000
    REG_PERLEX = 1 << 11,           // 04000

    // Predefined combinations.
    REG_PERL = REG_EXTENDED | REG_NOCOLLATE | REG_ESCAPE_IN_LISTS | REG_PERLEX,
    REG_AWK = REG_EXTENDED | REG_ESCAPE_IN_LISTS,
    REG_GREP = REG_BASIC | REG_NEWLINE_ALT,
    REG_EGREP = REG_EXTENDED | REG_NEWLINE_ALT,

    REG_ASSERT = 15,
    REG_INVARG = 16,
    REG_ATOI = 255,                 // convert name to number (!)
    REG_ITOA = 1 << 8               // 0400: convert number to name (!)
}
/// <summary>
/// Execution-time options passed as <c>eflags</c> to <c>regexecA</c>/<c>regexecW</c>.
/// </summary>
[Flags]
enum reg_exec_flags
{
    None = 0,
    REG_NOTBOL = 1 << 0,
    REG_NOTEOL = 1 << 1,
    REG_STARTEND = 1 << 2
}
/// <summary>
/// POSIX error codes returned by the <c>regcomp*</c> and <c>regexec*</c> functions.
/// Zero means success; three names are aliases of an earlier member.
/// </summary>
enum reg_error_t : uint
{
    REG_NOERROR = 0,                // Success.
    REG_NOMATCH = 1,                // Didn't find a match (for regexec).

    // POSIX regcomp return error codes, in the order listed in the standard.
    REG_BADPAT = 2,                 // Invalid pattern.
    REG_ECOLLATE = 3,               // Undefined collating element.
    REG_ECTYPE = 4,                 // Invalid character class name.
    REG_EESCAPE = 5,                // Trailing backslash.
    REG_ESUBREG = 6,                // Invalid back reference.
    REG_EBRACK = 7,                 // Unmatched left bracket.
    REG_EPAREN = 8,                 // Parenthesis imbalance.
    REG_EBRACE = 9,                 // Unmatched \{.
    REG_BADBR = 10,                 // Invalid contents of \{\}.
    REG_ERANGE = 11,                // Invalid range end.
    REG_ESPACE = 12,                // Ran out of memory.
    REG_BADRPT = 13,                // No preceding re for repetition op.
    REG_EEND = 14,                  // Unexpected end of expression.
    REG_ESIZE = 15,                 // Expression too big.
    REG_ERPAREN = REG_EPAREN,       // Alias (8): unmatched right parenthesis.
    REG_EMPTY = 17,                 // Empty expression.
    REG_E_MEMORY = REG_ESIZE,       // Alias (15): out of memory.
    REG_ECOMPLEXITY = 18,           // Complexity too high.
    REG_ESTACK = 19,                // Out of stack space.
    REG_E_PERL = 20,                // Perl (?...) error.
    REG_E_UNKNOWN = 21,             // Unknown error.
    REG_ENOSYS = REG_E_UNKNOWN,     // Alias (21): reserved.
}
#pragma warning restore IDE1006 // 命名スタイル
/// <summary>
/// P/Invoke declarations for the POSIX-compatible C API exported by the Boost.Regex DLL.
/// </summary>
/// <remarks>
/// Buffers are passed as a reference to their first element (<c>in</c>/<c>ref</c>) instead of raw
/// pointers. Pattern and subject buffers must be NUL-terminated because the native side reads
/// them as C strings.
/// </remarks>
internal static class NativeMethods
{
    // Single definition of the native library name (x64 release build) shared by every import.
    private const string dllName = "boost_regex-vc142-mt-x64-1_72";

    // The native prototypes are declared with BOOST_REGEX_CCALL, which Boost maps to __cdecl for
    // MSVC builds. The convention is irrelevant on x64 but must be spelled out for the x32 DLL,
    // where the DllImport default would otherwise be stdcall.
    private const CallingConvention callConv = CallingConvention.Cdecl;

    // ---- Narrow-character ("A") API ----

    /// <summary>Compiles the NUL-terminated <paramref name="pattern"/> into <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern reg_error_t regcompA(ref regex_tA preg, in byte pattern, reg_comp_flags cflags);

    /// <summary>Writes the message for <paramref name="errcode"/> into a caller-supplied byte buffer.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, ref byte errbuf, nuint errbuf_size);

    /// <summary>Writes the message for <paramref name="errcode"/> into a <see cref="StringBuilder"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, [Out] StringBuilder errbuf, nuint errbuf_size);

    /// <summary>Searches the NUL-terminated buffer starting at <paramref name="str"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern reg_error_t regexecA(in regex_tA preg, in byte str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

    /// <summary>Releases the native resources held by <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
    public static extern void regfreeA(ref regex_tA preg);

    // ---- Wide-character ("W", UTF-16) API ----

    /// <summary>Compiles the NUL-terminated <paramref name="pattern"/> into <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern reg_error_t regcompW(ref regex_tW preg, in char pattern, reg_comp_flags cflags);

    /// <summary>Writes the message for <paramref name="errcode"/> into a caller-supplied char buffer.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, ref char errbuf, nuint errbuf_size);

    /// <summary>Writes the message for <paramref name="errcode"/> into a <see cref="StringBuilder"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, [Out] StringBuilder errbuf, nuint errbuf_size);

    /// <summary>Searches the NUL-terminated buffer starting at <paramref name="str"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern reg_error_t regexecW(in regex_tW preg, in char str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

    /// <summary>Releases the native resources held by <paramref name="preg"/>.</summary>
    [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
    public static extern void regfreeW(ref regex_tW preg);
}
/// <summary>
/// Exception thrown when a Boost.Regex POSIX API call reports a non-zero error code.
/// </summary>
internal sealed class RegexException : Exception
{
    /// <summary>
    /// Creates the exception for a failed native call.
    /// </summary>
    /// <param name="errcode">Error code returned by the native function.</param>
    /// <param name="message">Human-readable description of the error.</param>
    public RegexException(reg_error_t errcode, string message) : base(message)
    {
        ErrorCode = errcode;
    }

    /// <summary>
    /// Error code returned by the failing native call, so handlers can branch on it
    /// (for example to tell "no match" apart from a malformed pattern).
    /// </summary>
    public reg_error_t ErrorCode { get; }
}
/// <summary>
/// Owns a compiled narrow-character regular expression (<c>regex_tA</c>) and releases it when
/// disposed or finalized.
/// </summary>
/// <remarks>
/// All input buffers must contain a terminating NUL byte because the native API reads C strings.
/// </remarks>
internal sealed class RegexA : IDisposable
{
    private regex_tA reg = default;
    private bool _DisposedValue;

    /// <summary>
    /// Compiles <paramref name="pattern"/> without throwing on a pattern error.
    /// </summary>
    /// <param name="pattern">NUL-terminated pattern bytes.</param>
    /// <param name="errcode">Receives the native result code (zero on success).</param>
    /// <param name="flags">Compile options.</param>
    /// <returns><c>true</c> when the native call returned zero.</returns>
    /// <exception cref="ArgumentException"><paramref name="pattern"/> has no NUL terminator.</exception>
    /// <exception cref="ObjectDisposedException">The instance was already disposed.</exception>
    public bool TryCompile(ReadOnlySpan<byte> pattern, out reg_error_t errcode, reg_comp_flags flags = reg_comp_flags.REG_BASIC)
    {
        ThrowIfDisposed();
        EnsureNulTerminated(pattern, nameof(pattern));
        errcode = NativeMethods.regcompA(ref reg, in pattern[0], flags);
        return errcode == 0;
    }

    /// <summary>
    /// Compiles <paramref name="pattern"/> and throws when the native call fails.
    /// </summary>
    /// <param name="pattern">NUL-terminated pattern bytes.</param>
    /// <param name="flags">Compile options.</param>
    /// <exception cref="RegexException">The native call returned a non-zero code.</exception>
    public void Compile(ReadOnlySpan<byte> pattern, reg_comp_flags flags = reg_comp_flags.REG_BASIC)
    {
        if (!TryCompile(pattern, out var errcode, flags))
        {
            throw new RegexException(errcode, GetErrorMessage(errcode));
        }
    }

    /// <summary>
    /// Searches <paramref name="str"/> and stores the match offsets in <paramref name="matches"/>.
    /// </summary>
    /// <param name="str">NUL-terminated subject bytes.</param>
    /// <param name="matches">Match slots to fill; may be empty when only success/failure matters.</param>
    /// <param name="flags">Execution options.</param>
    /// <exception cref="ArgumentException"><paramref name="str"/> has no NUL terminator.</exception>
    /// <exception cref="ObjectDisposedException">The instance was already disposed.</exception>
    /// <exception cref="RegexException">The native call returned a non-zero code (including "no match").</exception>
    public void Exec(ReadOnlySpan<byte> str, Span<regmatch_t> matches, reg_exec_flags flags = 0)
    {
        ThrowIfDisposed();
        EnsureNulTerminated(str, nameof(str));
        // GetReference does not index the span, so an empty 'matches' (nmatch == 0) no longer
        // fails with IndexOutOfRangeException before the native call is even made.
        var errcode = NativeMethods.regexecA(in reg, in str[0], (uint)matches.Length, ref MemoryMarshal.GetReference(matches), flags);
        if (errcode != 0)
        {
            throw new RegexException(errcode, GetErrorMessage(errcode));
        }
    }

    /// <summary>
    /// Misspelled original name of <see cref="Exec"/>; kept so existing callers keep compiling.
    /// </summary>
    public void Excec(ReadOnlySpan<byte> str, Span<regmatch_t> matches, reg_exec_flags flags = 0)
        => Exec(str, matches, flags);

    /// <summary>
    /// Asks the native library for the message that belongs to <paramref name="errcode"/>.
    /// </summary>
    private string GetErrorMessage(reg_error_t errcode)
    {
        StringBuilder sb = new(256);
        // The return value is the buffer size needed for the whole message INCLUDING the NUL
        // terminator. It must not be assigned to sb.Length (that appended a '\0' to every
        // message); the marshaller already copies the text back up to the terminator.
        nint required = NativeMethods.regerrorA(errcode, in reg, sb, (uint)sb.Capacity);
        if (required > sb.Capacity)
        {
            // The message did not fit: retry once with a buffer of the reported size.
            sb = new StringBuilder((int)required);
            NativeMethods.regerrorA(errcode, in reg, sb, (uint)sb.Capacity);
        }
        return sb.ToString();
    }

    /// <summary>Rejects buffers the native side would read past the end of.</summary>
    private static void EnsureNulTerminated(ReadOnlySpan<byte> buffer, string paramName)
    {
        if (buffer.IndexOf((byte)0) < 0)
        {
            throw new ArgumentException("The buffer must contain a terminating NUL byte.", paramName);
        }
    }

    /// <summary>Prevents use of the native state after it has been freed.</summary>
    private void ThrowIfDisposed()
    {
        if (_DisposedValue)
        {
            throw new ObjectDisposedException(nameof(RegexA));
        }
    }

    // Frees the native expression exactly once; runs for both Dispose() and the finalizer
    // because only unmanaged state is involved.
    private void Dispose(bool disposing)
    {
        if (!_DisposedValue)
        {
            NativeMethods.regfreeA(ref reg);
            _DisposedValue = true;
        }
    }

    ~RegexA()
    {
        Dispose(disposing: false);
    }

    /// <summary>Releases the compiled expression and suppresses finalization.</summary>
    public void Dispose()
    {
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }
}
}
Discussion