📚

boost.regex を C#から呼び出す

2021/02/21に公開

C# で正規表現を利用する場合、通常はSystem.Text.RegularExpressions.Regexクラスを使用しますが、.NET 5.0の時点で入力にstringしか受け付けないので使い勝手が悪いです。

この記事では、C++の正規表現エンジンである boost.regex を使用してみます。
やることは単純にP/InvokeでPOSIX APIを呼び出すだけです。

前提:

  • Windows環境
  • .NET Core 3.1
  • <LangVersion>9.0</LangVersion>
  • <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
  • boost.regexのバージョンは1.72

boost.regex

C/C++のライブラリをC++/CLIでラップしてもいいですが、
boost.regexにはPOSIX 互換インターフェースがあるので、これがあればC#から正規表現を使用することができます。

インターフェースには、ASCII版の-A系関数と、UNICODE(UTF-16)版の-W系関数があります。

/* Narrow-character (char) variants of the POSIX-style entry points:
   compile a pattern, format an error code, execute a search, free a compiled pattern. */
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regcompA(regex_tA*, const char*, int);
BOOST_REGEX_DECL regsize_t BOOST_REGEX_CCALL regerrorA(int, const regex_tA*, char*, regsize_t);
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regexecA(const regex_tA*, const char*, regsize_t, regmatch_t*, int);
BOOST_REGEX_DECL void BOOST_REGEX_CCALL regfreeA(regex_tA*);

/* Wide-character (wchar_t) variants; only declared when BOOST_NO_WREGEX is not defined. */
#ifndef BOOST_NO_WREGEX
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regcompW(regex_tW*, const wchar_t*, int);
BOOST_REGEX_DECL regsize_t BOOST_REGEX_CCALL regerrorW(int, const regex_tW*, wchar_t*, regsize_t);
BOOST_REGEX_DECL int BOOST_REGEX_CCALL regexecW(const regex_tW*, const wchar_t*, regsize_t, regmatch_t*, int);
BOOST_REGEX_DECL void BOOST_REGEX_CCALL regfreeW(regex_tW*);
#endif

ネイティブのライブラリを使用する場合、自分でライブラリをビルドするのは敷居が高いのですがboost.regexは幸いなことにビルド済みパッケージがNugetで配布されています。

boost_regex-vc142

ただしC++プロジェクト向けなので、C#のプロジェクトで参照してもDLLはプロジェクトには配置されません。
ソリューションに空のC++プロジェクトを追加すれば、packages の中にDLLが展開されます。
(もしくは、ローカルのNugetキャッシュや直接.nupkgをダウンロードするなど)

...
packages\boost_regex-vc142.1.72.0.0\lib\native のディレクトリ

2021/02/20  13:37    <DIR>          .
2021/02/20  13:37    <DIR>          ..
2020/04/18  21:45         1,665,024 boost_regex-vc142-mt-gd-x32-1_72.dll
2020/04/18  21:45         1,190,262 boost_regex-vc142-mt-gd-x32-1_72.lib
2020/04/18  21:45         2,061,824 boost_regex-vc142-mt-gd-x64-1_72.dll
2020/04/18  21:45         1,202,468 boost_regex-vc142-mt-gd-x64-1_72.lib
2020/04/18  21:45           683,008 boost_regex-vc142-mt-x32-1_72.dll
2020/04/18  21:45         1,186,870 boost_regex-vc142-mt-x32-1_72.lib
2020/04/18  21:45           793,088 boost_regex-vc142-mt-x64-1_72.dll
2020/04/18  21:45         1,199,086 boost_regex-vc142-mt-x64-1_72.lib
...

gdはデバッグシンボル付のはずなので、 boost_regex-vc142-mt-x32-1_72.dll または boost_regex-vc142-mt-x64-1_72.dll を使います。

C#から呼び出す

UNICODE版を使用してもRegexクラスと差がないので、ASCII版の関数を使います。

boost.regexの中身は見ていませんが、おそらくアクティブなコードページで動作すると思われます。ここではASCII文字の範囲だけを使います。
マルチバイト文字セット(要はShift-JIS)での動作は見ません。

P/Invokeの関数を用意しましょう。
引数はポインターではなく、ref / in(読み取り専用の参照渡し)にしました。


    /// <summary>
    /// P/Invoke declarations for the POSIX-compatible C entry points exported by the boost.regex DLL.
    /// The <c>A</c> functions operate on narrow (byte) strings, the <c>W</c> functions on UTF-16 strings.
    /// </summary>
    /// <remarks>
    /// Text arguments are passed as a reference to their first element, so the caller must supply
    /// a NUL-terminated buffer.
    /// </remarks>
    internal static class NativeMethods
    {
        // Name of the x64 release build of the library (resolved through the normal DLL search).
        private const string dllName = "boost_regex-vc142-mt-x64-1_72";

        // The native prototypes are declared with BOOST_REGEX_CCALL, i.e. the C calling convention.
        // DllImport defaults to Winapi, which means stdcall on 32-bit Windows, so cdecl is requested
        // explicitly. 64-bit Windows has a single calling convention and is unaffected by this.
        private const CallingConvention callConv = CallingConvention.Cdecl;

        /// <summary>Compiles a NUL-terminated narrow pattern into <paramref name="preg"/>.</summary>
        /// <param name="preg">Structure that receives the compiled expression.</param>
        /// <param name="pattern">First byte of the NUL-terminated pattern.</param>
        /// <param name="cflags">Compilation options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> on success, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern reg_error_t regcompA(ref regex_tA preg, in byte pattern, reg_comp_flags cflags);

        /// <summary>Writes the message for <paramref name="errcode"/> into a raw byte buffer.</summary>
        /// <param name="errcode">Error code to describe.</param>
        /// <param name="preg">Compiled expression the error relates to.</param>
        /// <param name="errbuf">First byte of the destination buffer.</param>
        /// <param name="errbuf_size">Size of the destination buffer in bytes.</param>
        /// <returns>The size value reported by the native function.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, ref byte errbuf, nuint errbuf_size);

        /// <summary>Same as the byte-buffer overload, but marshals the message into a <see cref="StringBuilder"/>.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, [Out] StringBuilder errbuf, nuint errbuf_size);

        /// <summary>Searches a NUL-terminated narrow string with a compiled expression.</summary>
        /// <param name="preg">Expression compiled by <see cref="regcompA"/>.</param>
        /// <param name="str">First byte of the NUL-terminated subject string.</param>
        /// <param name="nmatch">Number of entries available at <paramref name="pmatch"/>.</param>
        /// <param name="pmatch">First element of the array that receives the match offsets.</param>
        /// <param name="eflags">Execution options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> when a match was found, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern reg_error_t regexecA(in regex_tA preg, in byte str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

        /// <summary>Releases the native resources held by a compiled narrow expression.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern void regfreeA(ref regex_tA preg);


        /// <summary>Compiles a NUL-terminated UTF-16 pattern into <paramref name="preg"/>.</summary>
        /// <param name="preg">Structure that receives the compiled expression.</param>
        /// <param name="pattern">First character of the NUL-terminated pattern.</param>
        /// <param name="cflags">Compilation options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> on success, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern reg_error_t regcompW(ref regex_tW preg, in char pattern, reg_comp_flags cflags);

        /// <summary>Writes the message for <paramref name="errcode"/> into a raw UTF-16 buffer.</summary>
        /// <param name="errcode">Error code to describe.</param>
        /// <param name="preg">Compiled expression the error relates to.</param>
        /// <param name="errbuf">First character of the destination buffer.</param>
        /// <param name="errbuf_size">Size of the destination buffer.</param>
        /// <returns>The size value reported by the native function.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, ref char errbuf, nuint errbuf_size);

        /// <summary>Same as the char-buffer overload, but marshals the message into a <see cref="StringBuilder"/>.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, [Out] StringBuilder errbuf, nuint errbuf_size);

        /// <summary>Searches a NUL-terminated UTF-16 string with a compiled expression.</summary>
        /// <param name="preg">Expression compiled by <see cref="regcompW"/>.</param>
        /// <param name="str">First character of the NUL-terminated subject string.</param>
        /// <param name="nmatch">Number of entries available at <paramref name="pmatch"/>.</param>
        /// <param name="pmatch">First element of the array that receives the match offsets.</param>
        /// <param name="eflags">Execution options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> when a match was found, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern reg_error_t regexecW(in regex_tW preg, in char str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

        /// <summary>Releases the native resources held by a compiled UTF-16 expression.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern void regfreeW(ref regex_tW preg);
    }

ライブラリー名はそのまま使用していますが、ビルド時に変える仕組みにするか、NativeLibrary.SetDllImportResolverで実行時にいい感じに解決するのがいいでしょう。

あとはCと同じ感覚で呼び出すだけです。
ただし、入力はNUL文字終端が必要です。

manpageにあるサンプルを移植したコードです。

using System;
using System.Runtime.InteropServices;
using System.Text;
using static BoostRegex.NativeMethods;

namespace BoostRegex
{
    /// <summary>
    /// Port of the sample from the regex(3) man page: finds every match of a pattern in a
    /// multi-line ASCII text through the boost.regex POSIX API and prints each one.
    /// </summary>
    static class Program
    {
        /// <summary>
        /// Compiles the pattern, repeatedly searches the not-yet-consumed part of the text and
        /// prints the offset, length and content of every match.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            // The native API works on C strings, hence the explicit trailing "\0" in both literals.
            var str = Encoding.Default.GetBytes("1) John Driverhacker;\n2) John Doe;\n3) John Foo;\n\0");
            var pattern = Encoding.Default.GetBytes("John.*o\0");
            var matches = new regmatch_t[1];

            regex_tA preg = default;
            reg_error_t rc = regcompA(ref preg, in pattern[0], reg_comp_flags.REG_NEWLINE);
            if (rc != reg_error_t.REG_NOERROR)
            {
                Console.WriteLine($"regcompA() failed. ({rc})");
                return;
            }

            try
            {
                // Index into str where the next search starts; reported offsets are relative to it.
                nint pos = 0;

                for (var i = 0; ; i++)
                {
                    rc = regexecA(in preg, in str[pos], (uint)matches.Length, ref matches[0], reg_exec_flags.None);
                    if (rc == reg_error_t.REG_NOMATCH)
                    {
                        // Running out of matches is the regular way to leave the loop, not a failure.
                        break;
                    }

                    if (rc != reg_error_t.REG_NOERROR)
                    {
                        Console.WriteLine($"regexecA() failed.({rc})");
                        break;
                    }

                    var match = matches[0];
                    if (match.rm_so == -1 || match.rm_eo == -1)
                    {
                        break;
                    }

                    nint off = match.rm_so + pos;
                    nint len = match.rm_eo - match.rm_so;

                    Console.WriteLine($"#{i}:");
                    Console.WriteLine($"offset = {off}; length = {len}");
                    Console.WriteLine($"substring = \"{GetAnsiString(str.AsSpan((int)off, (int)len))}\"");

                    // Continue right after the end of this match.
                    pos += match.rm_eo;
                }
            }
            finally
            {
                // Release the compiled expression even if the loop above throws.
                regfreeA(ref preg);
            }
        }

        /// <summary>Decodes a slice of the subject buffer with <see cref="Encoding.Default"/>.</summary>
        static string GetAnsiString(ReadOnlySpan<byte> str) => Encoding.Default.GetString(str);
    }
}

regex_tはラッパークラスを用意したほうがいいでしょう。

全文は折りたたんでます。
BoostRegex.cs
using System;
using System.Runtime.InteropServices;
using System.Text;
using static BoostRegex.NativeMethods;

namespace BoostRegex
{
    /// <summary>
    /// Port of the sample from the regex(3) man page: finds every match of a pattern in a
    /// multi-line ASCII text through the boost.regex POSIX API and prints each one.
    /// </summary>
    static class Program
    {
        /// <summary>
        /// Compiles the pattern, repeatedly searches the not-yet-consumed part of the text and
        /// prints the offset, length and content of every match.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

            // The native API works on C strings, hence the explicit trailing "\0" in both literals.
            var str = Encoding.Default.GetBytes("1) John Driverhacker;\n2) John Doe;\n3) John Foo;\n\0");
            var pattern = Encoding.Default.GetBytes("John.*o\0");
            var matches = new regmatch_t[1];

            regex_tA preg = default;
            reg_error_t rc = regcompA(ref preg, in pattern[0], reg_comp_flags.REG_NEWLINE);
            if (rc != reg_error_t.REG_NOERROR)
            {
                Console.WriteLine($"regcompA() failed. ({rc})");
                return;
            }

            try
            {
                // Index into str where the next search starts; reported offsets are relative to it.
                nint pos = 0;

                for (var i = 0; ; i++)
                {
                    rc = regexecA(in preg, in str[pos], (uint)matches.Length, ref matches[0], reg_exec_flags.None);
                    if (rc == reg_error_t.REG_NOMATCH)
                    {
                        // Running out of matches is the regular way to leave the loop, not a failure.
                        break;
                    }

                    if (rc != reg_error_t.REG_NOERROR)
                    {
                        Console.WriteLine($"regexecA() failed.({rc})");
                        break;
                    }

                    var match = matches[0];
                    if (match.rm_so == -1 || match.rm_eo == -1)
                    {
                        break;
                    }

                    nint off = match.rm_so + pos;
                    nint len = match.rm_eo - match.rm_so;

                    Console.WriteLine($"#{i}:");
                    Console.WriteLine($"offset = {off}; length = {len}");
                    Console.WriteLine($"substring = \"{GetAnsiString(str.AsSpan((int)off, (int)len))}\"");

                    // Continue right after the end of this match.
                    pos += match.rm_eo;
                }
            }
            finally
            {
                // Release the compiled expression even if the loop above throws.
                regfreeA(ref preg);
            }
        }

        /// <summary>Decodes a slice of the subject buffer with <see cref="Encoding.Default"/>.</summary>
        static string GetAnsiString(ReadOnlySpan<byte> str) => Encoding.Default.GetString(str);
    }

#pragma warning disable IDE1006 // 命名スタイル
    /// <summary>
    /// Managed copy of the native match/format flag set. Each match flag occupies one bit,
    /// followed by the format flags in the next higher bits.
    /// </summary>
    [Flags]
    public enum match_flag_type
    {
        match_default = 0,
        match_not_bol = 1 << 0,             // first is not start of line
        match_not_eol = 1 << 1,             // last is not end of line
        match_not_bob = 1 << 2,             // first is not start of buffer
        match_not_eob = 1 << 3,             // last is not end of buffer
        match_not_bow = 1 << 4,             // first is not start of word
        match_not_eow = 1 << 5,             // last is not end of word
        match_not_dot_newline = 1 << 6,     // \n is not matched by '.'
        match_not_dot_null = 1 << 7,        // '\0' is not matched by '.'
        match_prev_avail = 1 << 8,          // *--first is a valid expression
        match_init = 1 << 9,                // internal use
        match_any = 1 << 10,                // don't care what we match
        match_not_null = 1 << 11,           // string can't be null
        match_continuous = 1 << 12,         // each grep match must continue uninterrupted from the previous one
        match_partial = 1 << 13,            // find partial matches

        match_stop = 1 << 14,               // stop after first match (grep), V3 only
        match_not_initial_null = match_stop, // don't match initial null, V4 only
        match_all = 1 << 15,                // must find the whole of input even if match_any is set
        match_perl = 1 << 16,               // use Perl matching rules
        match_posix = 1 << 17,              // use POSIX matching rules
        match_nosubs = 1 << 18,             // don't trap marked subs
        match_extra = 1 << 19,              // include full capture information for repeated captures
        match_single_line = 1 << 20,        // treat text as single line and ignore any \n's when matching ^ and $
        match_unused1 = 1 << 21,            // unused
        match_unused2 = 1 << 22,            // unused
        match_unused3 = 1 << 23,            // unused
        match_max = match_unused3,

        format_perl = 0,                    // Perl style replacement
        format_default = 0,                 // ditto
        format_sed = 1 << 24,               // sed style replacement
        format_all = 1 << 25,               // enable all extensions to syntax
        format_no_copy = 1 << 26,           // don't copy non-matching segments
        format_first_only = 1 << 27,        // only replace first occurrence
        format_is_if = 1 << 28,             // internal use only
        format_literal = 1 << 29,           // treat string as a literal

        // Union of every flag above except match_any (zero-valued and aliased members add nothing).
        match_not_any =
            match_not_bol
            | match_not_eol
            | match_not_bob
            | match_not_eob
            | match_not_bow
            | match_not_eow
            | match_not_dot_newline
            | match_not_dot_null
            | match_prev_avail
            | match_init
            | match_not_null
            | match_continuous
            | match_partial
            | match_stop
            | match_all
            | match_perl
            | match_posix
            | match_nosubs
            | match_extra
            | match_single_line
            | match_unused1
            | match_unused2
            | match_unused3
            | format_sed
            | format_all
            | format_no_copy
            | format_first_only
            | format_is_if
            | format_literal


    }

    /// <summary>
    /// Managed mirror of the compiled-expression structure used by the narrow-character (A) functions.
    /// It is passed by reference to native code, so the field order and types must not be changed.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    internal unsafe struct regex_tA
    {
        private uint re_magic;

        /// <summary>Number of parenthesized subexpressions.</summary>
        public nuint re_nsub;

        /// <summary>End pointer for REG_PEND.</summary>
        public byte* re_endp;

        // The two remaining fields are opaque to managed code.
        private nint guts;
        private match_flag_type eflags;
    }

    /// <summary>
    /// Managed mirror of the compiled-expression structure used by the wide-character (W) functions.
    /// It is passed by reference to native code, so the field order and types must not be changed.
    /// </summary>
    [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
    internal unsafe struct regex_tW
    {
        private uint re_magic;

        /// <summary>Number of parenthesized subexpressions.</summary>
        public nuint re_nsub;

        /// <summary>End pointer for REG_PEND.</summary>
        public char* re_endp;

        // The two remaining fields are opaque to managed code.
        private nint guts;
        private match_flag_type eflags;
    }

    /// <summary>
    /// One match result filled in by the native search functions: a pair of offsets
    /// relative to the start of the searched string.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    internal readonly struct regmatch_t
    {
        /// <summary>Start of match.</summary>
        public readonly nint rm_so;

        /// <summary>End of match.</summary>
        public readonly nint rm_eo;
    }
    /// <summary>
    /// Options accepted by the regcomp functions. The trailing notes give the
    /// octal spelling carried over from the original listing.
    /// </summary>
    [Flags]
    internal enum reg_comp_flags
    {
        REG_BASIC = 0,
        REG_EXTENDED = 1 << 0,
        REG_ICASE = 1 << 1,
        REG_NOSUB = 1 << 2,
        REG_NEWLINE = 1 << 3,           // 0010
        REG_NOSPEC = 1 << 4,            // 0020
        REG_PEND = 1 << 5,              // 0040
        REG_DUMP = 1 << 7,              // 0200
        REG_NOCOLLATE = 1 << 8,         // 0400
        REG_ESCAPE_IN_LISTS = 1 << 9,   // 01000
        REG_NEWLINE_ALT = 1 << 10,      // 02000
        REG_PERLEX = 1 << 11,           // 04000

        // Predefined combinations.
        REG_PERL = REG_EXTENDED | REG_NOCOLLATE | REG_ESCAPE_IN_LISTS | REG_PERLEX,
        REG_AWK = REG_EXTENDED | REG_ESCAPE_IN_LISTS,
        REG_GREP = REG_BASIC | REG_NEWLINE_ALT,
        REG_EGREP = REG_EXTENDED | REG_NEWLINE_ALT,

        REG_ASSERT = 15,
        REG_INVARG = 16,
        REG_ATOI = 255,                 // convert name to number (!)
        REG_ITOA = 1 << 8               // 0400, convert number to name (!)
    }

    /// <summary>Options accepted by the regexec functions.</summary>
    [Flags]
    internal enum reg_exec_flags
    {
        /// <summary>No option set.</summary>
        None = 0,
        REG_NOTBOL = 1 << 0,
        REG_NOTEOL = 1 << 1,
        REG_STARTEND = 1 << 2
    }

    /// <summary>
    /// POSIX error codes, used as the return type of the regcomp and regexec imports.
    /// Several names share a value; the aliases refer to the member they duplicate.
    /// </summary>
    internal enum reg_error_t : uint
    {
        REG_NOERROR = 0,                // Success.
        REG_NOMATCH = 1,                // Didn't find a match (for regexec).

        // POSIX regcomp return error codes (in the order listed in the standard).
        REG_BADPAT = 2,                 // Invalid pattern.
        REG_ECOLLATE = 3,               // Undefined collating element.
        REG_ECTYPE = 4,                 // Invalid character class name.
        REG_EESCAPE = 5,                // Trailing backslash.
        REG_ESUBREG = 6,                // Invalid back reference.
        REG_EBRACK = 7,                 // Unmatched left bracket.
        REG_EPAREN = 8,                 // Parenthesis imbalance.
        REG_EBRACE = 9,                 // Unmatched \{.
        REG_BADBR = 10,                 // Invalid contents of \{\}.
        REG_ERANGE = 11,                // Invalid range end.
        REG_ESPACE = 12,                // Ran out of memory.
        REG_BADRPT = 13,                // No preceding re for repetition op.
        REG_EEND = 14,                  // Unexpected end of expression.
        REG_ESIZE = 15,                 // Expression too big.
        REG_ERPAREN = REG_EPAREN,       // Unmatched right parenthesis.
        REG_EMPTY = 17,                 // Empty expression.
        REG_E_MEMORY = REG_ESIZE,       // Out of memory.
        REG_ECOMPLEXITY = 18,           // Complexity too high.
        REG_ESTACK = 19,                // Out of stack space.
        REG_E_PERL = 20,                // Perl (?...) error.
        REG_E_UNKNOWN = 21,             // Unknown error.
        REG_ENOSYS = REG_E_UNKNOWN,     // Reserved.
    }

#pragma warning restore IDE1006 // 命名スタイル

    /// <summary>
    /// P/Invoke declarations for the POSIX-compatible C entry points exported by the boost.regex DLL.
    /// The <c>A</c> functions operate on narrow (byte) strings, the <c>W</c> functions on UTF-16 strings.
    /// </summary>
    /// <remarks>
    /// Text arguments are passed as a reference to their first element, so the caller must supply
    /// a NUL-terminated buffer.
    /// </remarks>
    internal static class NativeMethods
    {
        // Single place for the library name instead of repeating the literal on every import.
        private const string dllName = "boost_regex-vc142-mt-x64-1_72";

        // The native prototypes are declared with BOOST_REGEX_CCALL, i.e. the C calling convention.
        // DllImport defaults to Winapi, which means stdcall on 32-bit Windows, so cdecl is requested
        // explicitly. 64-bit Windows has a single calling convention and is unaffected by this.
        private const CallingConvention callConv = CallingConvention.Cdecl;

        /// <summary>Compiles a NUL-terminated narrow pattern into <paramref name="preg"/>.</summary>
        /// <param name="preg">Structure that receives the compiled expression.</param>
        /// <param name="pattern">First byte of the NUL-terminated pattern.</param>
        /// <param name="cflags">Compilation options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> on success, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern reg_error_t regcompA(ref regex_tA preg, in byte pattern, reg_comp_flags cflags);

        /// <summary>Writes the message for <paramref name="errcode"/> into a raw byte buffer.</summary>
        /// <param name="errcode">Error code to describe.</param>
        /// <param name="preg">Compiled expression the error relates to.</param>
        /// <param name="errbuf">First byte of the destination buffer.</param>
        /// <param name="errbuf_size">Size of the destination buffer in bytes.</param>
        /// <returns>The size value reported by the native function.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, ref byte errbuf, nuint errbuf_size);

        /// <summary>Same as the byte-buffer overload, but marshals the message into a <see cref="StringBuilder"/>.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern nint regerrorA(reg_error_t errcode, in regex_tA preg, [Out] StringBuilder errbuf, nuint errbuf_size);

        /// <summary>Searches a NUL-terminated narrow string with a compiled expression.</summary>
        /// <param name="preg">Expression compiled by <see cref="regcompA"/>.</param>
        /// <param name="str">First byte of the NUL-terminated subject string.</param>
        /// <param name="nmatch">Number of entries available at <paramref name="pmatch"/>.</param>
        /// <param name="pmatch">First element of the array that receives the match offsets.</param>
        /// <param name="eflags">Execution options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> when a match was found, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern reg_error_t regexecA(in regex_tA preg, in byte str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

        /// <summary>Releases the native resources held by a compiled narrow expression.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Ansi, ExactSpelling = true)]
        public static extern void regfreeA(ref regex_tA preg);


        /// <summary>Compiles a NUL-terminated UTF-16 pattern into <paramref name="preg"/>.</summary>
        /// <param name="preg">Structure that receives the compiled expression.</param>
        /// <param name="pattern">First character of the NUL-terminated pattern.</param>
        /// <param name="cflags">Compilation options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> on success, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern reg_error_t regcompW(ref regex_tW preg, in char pattern, reg_comp_flags cflags);

        /// <summary>Writes the message for <paramref name="errcode"/> into a raw UTF-16 buffer.</summary>
        /// <param name="errcode">Error code to describe.</param>
        /// <param name="preg">Compiled expression the error relates to.</param>
        /// <param name="errbuf">First character of the destination buffer.</param>
        /// <param name="errbuf_size">Size of the destination buffer.</param>
        /// <returns>The size value reported by the native function.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, ref char errbuf, nuint errbuf_size);

        /// <summary>Same as the char-buffer overload, but marshals the message into a <see cref="StringBuilder"/>.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern nint regerrorW(reg_error_t errcode, in regex_tW preg, [Out] StringBuilder errbuf, nuint errbuf_size);

        /// <summary>Searches a NUL-terminated UTF-16 string with a compiled expression.</summary>
        /// <param name="preg">Expression compiled by <see cref="regcompW"/>.</param>
        /// <param name="str">First character of the NUL-terminated subject string.</param>
        /// <param name="nmatch">Number of entries available at <paramref name="pmatch"/>.</param>
        /// <param name="pmatch">First element of the array that receives the match offsets.</param>
        /// <param name="eflags">Execution options.</param>
        /// <returns><see cref="reg_error_t.REG_NOERROR"/> when a match was found, otherwise an error code.</returns>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern reg_error_t regexecW(in regex_tW preg, in char str, nuint nmatch, ref regmatch_t pmatch, reg_exec_flags eflags);

        /// <summary>Releases the native resources held by a compiled UTF-16 expression.</summary>
        [DllImport(dllName, CallingConvention = callConv, CharSet = CharSet.Unicode, ExactSpelling = true)]
        public static extern void regfreeW(ref regex_tW preg);
    }

    /// <summary>
    /// Exception raised when a boost.regex call reports a non-zero error code.
    /// </summary>
    internal sealed class RegexException : Exception
    {
        /// <summary>
        /// Creates the exception for a native error code.
        /// </summary>
        /// <param name="errcode">Error code returned by the native function.</param>
        /// <param name="message">Human-readable description of the error.</param>
        public RegexException(reg_error_t errcode, string message) : base(message)
        {
            ErrorCode = errcode;
        }

        /// <summary>
        /// Native error code that caused this exception, so callers can branch on it
        /// (previously it was stored but not readable from outside).
        /// </summary>
        public reg_error_t ErrorCode { get; }
    }

    /// <summary>
    /// Managed wrapper that owns a native narrow-character compiled expression
    /// (<see cref="regex_tA"/>) and releases it on dispose or finalization.
    /// </summary>
    internal sealed class RegexA : IDisposable
    {
        private regex_tA reg = default;
        private bool _DisposedValue;

        /// <summary>
        /// Compiles <paramref name="pattern"/> without throwing on a compilation error.
        /// </summary>
        /// <param name="pattern">Pattern bytes; must contain a terminating NUL byte.</param>
        /// <param name="errcode">Receives the native result code.</param>
        /// <param name="flags">Compilation options.</param>
        /// <returns><c>true</c> when the native call returned zero.</returns>
        /// <exception cref="ArgumentException"><paramref name="pattern"/> has no NUL terminator.</exception>
        public bool TryCompile(ReadOnlySpan<byte> pattern, out reg_error_t errcode, reg_comp_flags flags = reg_comp_flags.REG_BASIC)
        {
            // Checked in every build configuration: native code scans for the terminator and
            // would read past the end of the buffer without it.
            RequireNulTerminated(pattern, nameof(pattern));

            errcode = NativeMethods.regcompA(ref reg, in pattern[0], flags);
            return errcode == reg_error_t.REG_NOERROR;
        }

        /// <summary>
        /// Compiles <paramref name="pattern"/> and throws when compilation fails.
        /// </summary>
        /// <param name="pattern">Pattern bytes; must contain a terminating NUL byte.</param>
        /// <param name="flags">Compilation options.</param>
        /// <exception cref="RegexException">The native compiler reported an error.</exception>
        public void Compile(ReadOnlySpan<byte> pattern, reg_comp_flags flags = reg_comp_flags.REG_BASIC)
        {
            if (!TryCompile(pattern, out var errcode, flags))
            {
                throw new RegexException(errcode, GetErrorMessage(errcode));
            }
        }

        /// <summary>
        /// Searches <paramref name="str"/> and stores the match offsets in <paramref name="matches"/>.
        /// </summary>
        /// <param name="str">Subject bytes; must contain a terminating NUL byte unless REG_STARTEND is used.</param>
        /// <param name="matches">Destination for match offsets; may be empty when no offsets are wanted.</param>
        /// <param name="flags">Execution options.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="str"/> has no NUL terminator, or REG_STARTEND was requested with an empty
        /// <paramref name="matches"/> span.
        /// </exception>
        /// <exception cref="RegexException">
        /// The native call returned a non-zero code (this includes <see cref="reg_error_t.REG_NOMATCH"/>).
        /// </exception>
        public void Exec(ReadOnlySpan<byte> str, Span<regmatch_t> matches, reg_exec_flags flags = reg_exec_flags.None)
        {
            if ((flags & reg_exec_flags.REG_STARTEND) != 0)
            {
                // With REG_STARTEND the searched range is taken from the first match entry,
                // so that entry has to exist.
                if (matches.IsEmpty)
                {
                    throw new ArgumentException("REG_STARTEND requires at least one match entry.", nameof(matches));
                }
            }
            else
            {
                RequireNulTerminated(str, nameof(str));
            }

            // GetReference (unlike the indexer) accepts empty spans; the native side is told the
            // element count and does not touch the pointer when that count is zero.
            var errcode = NativeMethods.regexecA(
                in reg,
                in MemoryMarshal.GetReference(str),
                (uint)matches.Length,
                ref MemoryMarshal.GetReference(matches),
                flags);
            if (errcode != reg_error_t.REG_NOERROR)
            {
                throw new RegexException(errcode, GetErrorMessage(errcode));
            }
        }

        /// <summary>
        /// Misspelled alias of <see cref="Exec"/>, kept so existing callers continue to compile.
        /// </summary>
        public void Excec(ReadOnlySpan<byte> str, Span<regmatch_t> matches, reg_exec_flags flags = 0)
            => Exec(str, matches, flags);

        /// <summary>
        /// Asks the native library for the message that belongs to <paramref name="errcode"/>.
        /// </summary>
        private unsafe string GetErrorMessage(reg_error_t errcode)
        {
            const int InitialCapacity = 256;

            // One spare byte that native code is never told about keeps the buffer NUL-terminated.
            var buffer = new byte[InitialCapacity + 1];
            nint required = NativeMethods.regerrorA(errcode, in reg, ref buffer[0], (uint)InitialCapacity);
            if (required > InitialCapacity)
            {
                // The return value is the buffer size needed for the whole message; retry with it.
                buffer = new byte[(int)required + 1];
                NativeMethods.regerrorA(errcode, in reg, ref buffer[0], (uint)required);
            }

            // The return value counts the terminator, so it must not be used as the text length;
            // reading up to the first NUL yields the message without a trailing '\0'.
            fixed (byte* text = buffer)
            {
                return Marshal.PtrToStringAnsi((IntPtr)text) ?? string.Empty;
            }
        }

        /// <summary>Throws when <paramref name="text"/> contains no NUL byte.</summary>
        private static void RequireNulTerminated(ReadOnlySpan<byte> text, string paramName)
        {
            if (text.IndexOf((byte)0) < 0)
            {
                throw new ArgumentException("The buffer must contain a terminating NUL byte.", paramName);
            }
        }

        private void Dispose(bool disposing)
        {
            if (!_DisposedValue)
            {
                // Only unmanaged state is involved, so the same call serves Dispose() and the finalizer.
                NativeMethods.regfreeA(ref reg);
                _DisposedValue = true;
            }
        }

        ~RegexA()
        {
            Dispose(disposing: false);
        }

        /// <summary>Frees the native expression and suppresses finalization.</summary>
        public void Dispose()
        {
            Dispose(disposing: true);
            GC.SuppressFinalize(this);
        }
    }
}

Discussion