Closed7

できる限り RFC3986 に準拠した URI 抽出用正規表現

yuimaruyuimaru

忙しいですか?こちらです

/(?<scheme>[a-zA-Z]([a-zA-Z0-9+.-])*):(?<hier_part>\/\/((?<userinfo>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:)*@)?(?<host>\[(?<ipv6address>(((?<h16_0>([0-9a-fA-F]{1,4})):){6}(?<ls32_0>((?<h16_1>([0-9a-fA-F]{1,4})):(?<h16_2>([0-9a-fA-F]{1,4}))|(?<ipv4address_0>((?<dec_octet_0>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_1>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_2>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_3>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|::((?<h16_3>([0-9a-fA-F]{1,4})):){5}(?<ls32_1>((?<h16_4>([0-9a-fA-F]{1,4})):(?<h16_5>([0-9a-fA-F]{1,4}))|(?<ipv4address_1>((?<dec_octet_4>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_5>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_6>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_7>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|((?<h16_6>([0-9a-fA-F]{1,4})))?::((?<h16_7>([0-9a-fA-F]{1,4})):){4}(?<ls32_2>((?<h16_8>([0-9a-fA-F]{1,4})):(?<h16_9>([0-9a-fA-F]{1,4}))|(?<ipv4address_2>((?<dec_octet_8>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_9>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_10>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_11>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_10>([0-9a-fA-F]{1,4})):){0,1}(?<h16_11>([0-9a-fA-F]{1,4})))?::((?<h16_12>([0-9a-fA-F]{1,4})):){3}(?<ls32_3>((?<h16_13>([0-9a-fA-F]{1,4})):(?<h16_14>([0-9a-fA-F]{1,4}))|(?<ipv4address_3>((?<dec_octet_12>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_13>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_14>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_15>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_15>([0-9a-fA-F]{1,4})):){0,2}(?<h16_16>([0-9a-fA-F]{1,4})))?::((?<h16_17>([0-9a-fA-F]{1,4})):){2}(?<ls32_4>((?<h16_18>([0-9a-fA-F]{1,4})):(?<h16_1
9>([0-9a-fA-F]{1,4}))|(?<ipv4address_4>((?<dec_octet_16>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_17>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_18>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_19>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_20>([0-9a-fA-F]{1,4})):){0,3}(?<h16_21>([0-9a-fA-F]{1,4})))?::(?<h16_22>([0-9a-fA-F]{1,4})):(?<ls32_5>((?<h16_23>([0-9a-fA-F]{1,4})):(?<h16_24>([0-9a-fA-F]{1,4}))|(?<ipv4address_5>((?<dec_octet_20>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_21>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_22>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_23>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_25>([0-9a-fA-F]{1,4})):){0,4}(?<h16_26>([0-9a-fA-F]{1,4})))?::(?<ls32_6>((?<h16_27>([0-9a-fA-F]{1,4})):(?<h16_28>([0-9a-fA-F]{1,4}))|(?<ipv4address_6>((?<dec_octet_24>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_25>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_26>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_27>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_29>([0-9a-fA-F]{1,4})):){0,5}(?<h16_30>([0-9a-fA-F]{1,4})))?::(?<h16_31>([0-9a-fA-F]{1,4}))|(((?<h16_32>([0-9a-fA-F]{1,4})):){0,6}(?<h16_33>([0-9a-fA-F]{1,4})))?::))\]|(?<ipv4address_7>((?<dec_octet_28>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_29>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_30>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_31>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))|(?<reg_name>([a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=])*))(:(?<port>[0-9]*))?(?<path>(?<path_abempty>(\/(?<segment_path_abempty>((?<pchar_segment_path_abempty>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*)|(?<path_absolute>(\/(?<segment_nz_path_absolute>((?<pchar_segment_nz_path_absolut
e>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_absolute>((?<pchar_segment_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))+?))|(?<path_noscheme>((?<segment_nz_nc_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|@)(\/(?<segment_path_noscheme>((?<pchar_segment_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?<path_rootless>((?<segment_nz_path_rootless>((?<pchar_segment_nz_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_rootless>((?<pchar_segment_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?:)))(\?(?<query>((?<pchar_query>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?(#(?<fragment>((?<pchar_fragment>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?/
過去のもの

1

/(?<scheme>[a-zA-Z]([a-zA-Z0-9+.-])*):(?<hier_part>\/\/((?<userinfo>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:)*@)?(?<host>((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?<reg_name>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=])*)(:(?<port>[0-9]*))?(?<path>(?<path_abempty>\/(?<segment_path_abempty>(?<pchar_segment_path_abempty>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*)*|(?<path_absolute>\/(?<segment_nz_path_absolute>(?<pchar_segment_nz_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+(\/(?<segment_path_absolute>(?<pchar_segment_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*)+?)|(?<path_noscheme>(?<segment_nz_nc_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|@)(\/(?<segment_path_noscheme>(?<pchar_segment_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*)*)|(?<path_rootless>(?<segment_nz_path_rootless>(?<pchar_segment_nz_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+(\/(?<segment_path_rootless>(?<pchar_segment_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*)*)|(?:)))(\?(?<query>(?<pchar_query>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*)?(#(?<fragment>(?<pchar_fragment>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*)?/

2

/(?<scheme>[a-zA-Z]([a-zA-Z0-9+.-])*):(?<hier_part>\/\/((?<userinfo>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:)*@)?(?<host>((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?<reg_name>([a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=])*))(:(?<port>[0-9]*))?(?<path>(?<path_abempty>(\/(?<segment_path_abempty>((?<pchar_segment_path_abempty>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*)|(?<path_absolute>(\/(?<segment_nz_path_absolute>((?<pchar_segment_nz_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_absolute>((?<pchar_segment_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))+?))|(?<path_noscheme>((?<segment_nz_nc_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|@)(\/(?<segment_path_noscheme>((?<pchar_segment_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?<path_rootless>((?<segment_nz_path_rootless>((?<pchar_segment_nz_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_rootless>((?<pchar_segment_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?:)))(\?(?<query>((?<pchar_query>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?(#(?<fragment>((?<pchar_fragment>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?/

解説

RFC 3986 の 3 章によると URI の構文は

      URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]

らしい

私は ABNF が読めないのでノリで生成しました.
生成に使ったコードが以下の通りです.

// First snapshot of the generator: builds an RFC 3986 style URI pattern by
// concatenating small pattern fragments.
// NOTE(review): only `gen_delims` and `absolute_uri` are corrected here. They do
// not take part in `uri`, so `uri` still equals the pattern printed in the
// write-up for this snapshot. Other deviations are flagged inline, not fixed.

/**
 * Concatenates pattern fragments into one pattern string.
 * Strings are used verbatim, RegExp objects contribute their `.source`,
 * anything else goes through `String()`.
 */
const source = (/** @type {any[]} */ ...r) => r.map(x => typeof x === "string" ? x : x instanceof RegExp ? x.source : String(x)).join("");
/**
 * Wraps the alternatives `r` (joined with "|") in one capturing group.
 * A non-empty `name` yields a named group `(?<name>...)`, an empty name a
 * plain `(...)`.
 */
const group = (name, /** @type {(string | RegExp)[]} */ ...r) => {
  return "(" + (name ? "?<" + name + ">" : "") + r.map(x => source(x)).join("|") + ")";
}
/**
 * Appends "?" to the fragment. The "?" binds to the last atom or quantifier
 * of `r`, so `r` must already be a single group to become optional as a whole.
 */
const opt = (/** @type {string} */ r) => source(r) + "?";
// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
// Fixed: "[" and "]" are escaped. Previously the class ended at the first "]"
// and the pattern required a literal "@]" after one of `:/?#[`.
export const gen_delims = new RegExp("[:/?#\\[\\]@]");
export const sub_delims = new RegExp("[!$&'()*+,;=]");
export const reserved = new RegExp(group("reserved", gen_delims, sub_delims));
// "unreserved" in RFC 3986; the exported name is kept as is.
export const unserved = new RegExp("[a-zA-Z0-9._~-]");
export const pct_encoded = new RegExp("%[0-9a-fA-F]{2}");

export const scheme = new RegExp(group("scheme", "[a-zA-Z]" + group("", "[a-zA-Z0-9+.-]") + "*"));

// NOTE(review): the "*" sits outside the named group here and in `reg_name`,
// `segment`, `segment_nz`, `query` and `fragment`, so those named groups
// capture only the last repetition (a single character).
export const userinfo = new RegExp(group("userinfo", unserved, pct_encoded, sub_delims, ":") + "*");
export const reg_name = new RegExp(group("reg_name", unserved, pct_encoded, sub_delims) + "*");
export const ipv4 = /((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])/;
export const host = new RegExp(group("host", ipv4, reg_name));
export const port = new RegExp(group("port", "[0-9]*"));
export const authority = new RegExp(opt(group("", source(userinfo) + "@")) + source(host) + opt(group("", ":" + source(port))));

// The factories take a suffix so every generated group name is unique inside
// the final pattern.
export const pchar = (/** @type {string} */ name) => new RegExp(group("pchar_" + name, unserved, pct_encoded, sub_delims, ":", "@"));
export const segment = (/** @type {string} */ name) => new RegExp(group("segment_" + name, pchar("segment_" + name)) + "*");
export const segment_nz = (/** @type {string} */ name) => new RegExp(group("segment_nz_" + name, pchar("segment_nz_" + name)) + "+");
// NOTE(review): RFC 3986 segment-nz-nc is 1*( ... ); this matches one character.
export const segment_nz_nc = (/** @type {string} */ name) => new RegExp(group("segment_nz_nc_" + name, unserved, pct_encoded, sub_delims, "@"));

export const path_abempty = new RegExp(group("path_abempty", "/" + source(segment("path_abempty"))) + "*");
// NOTE(review): opt() appends "?" right after "+", which gives a lazy "+?"
// instead of making the tail optional as in
// path-absolute = "/" [ segment-nz *( "/" segment ) ].
export const path_absolute = new RegExp(group("path_absolute", "/" + opt(source(segment_nz("path_absolute")) + group("", "/" + source(segment("path_absolute"))) + "+")));
export const path_noscheme = new RegExp(group("path_noscheme", source(segment_nz_nc("path_noscheme")) + group("", "/" + source(segment("path_noscheme"))) + "*"));
export const path_rootless = new RegExp(group("path_rootless", source(segment_nz("path_rootless")) + group("", "/" + source(segment("path_rootless"))) + "*"));
// The trailing "(?:)" alternative is the empty path.
export const path = new RegExp(group("path", path_abempty, path_absolute, path_noscheme, path_rootless, "(?:)"));

export const query = new RegExp(group("query", pchar("query"), "/", "\\?") + "*");
export const fragment = new RegExp(group("fragment", pchar("fragment"), "/", "\\?") + "*");

// Only the `"//" authority path` form of hier-part is generated.
export const hier_part = new RegExp(group("hier_part", "//" + source(authority) + source(path)));

// absolute-URI = scheme ":" hier-part [ "?" query ]
// Fixed: uses source(query). Concatenating the RegExp object itself inserted
// its "/.../" literal form, so a query only matched when wrapped in slashes.
export const absolute_uri = new RegExp(source(scheme) + ":" + source(hier_part) + opt(group("", "\\?" + source(query))));
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
export const uri = new RegExp(source(scheme) + ":" + source(hier_part) + opt(group("", "\\?" + source(query))) + opt(group("", "#" + source(fragment))));

まあひどい.
scheme の正規表現で ReDoS 攻撃を受けそうな感じがあるけど諦めてる.

group がいっぱいついているのでホスト名を抽出したい時とか,結構嬉しいかも?

yuimaruyuimaru

改善できそうなところがあるので帰ったら書く

yuimaruyuimaru

なんか group の処理がおかしかったので修正.IPv4 アドレス以外の部分は多分これでよし.

// Second snapshot of the generator: quantifiers now sit inside the named
// groups for most rules, so those groups capture the whole component.
// NOTE(review): only `gen_delims` and `absolute_uri` are corrected here. They do
// not take part in `uri`, so `uri` still equals the pattern printed in the
// write-up for this snapshot. Other deviations are flagged inline, not fixed.

/**
 * Concatenates pattern fragments into one pattern string.
 * Strings are used verbatim, RegExp objects contribute their `.source`,
 * anything else goes through `String()`.
 */
const source = (/** @type {any[]} */ ...r) => r.map(x => typeof x === "string" ? x : x instanceof RegExp ? x.source : String(x)).join("");
/**
 * Wraps the alternatives `r` (joined with "|") in one capturing group.
 * A non-empty `name` yields a named group `(?<name>...)`, an empty name a
 * plain `(...)`.
 */
const group = (name, /** @type {(string | RegExp)[]} */ ...r) => {
  return "(" + (name ? "?<" + name + ">" : "") + r.map(x => source(x)).join("|") + ")";
}
/**
 * Appends "?" to the fragment. The "?" binds to the last atom or quantifier
 * of `r`, so `r` must already be a single group to become optional as a whole.
 */
const opt = (/** @type {string} */ r) => source(r) + "?";
// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
// Fixed: "[" and "]" are escaped. Previously the class ended at the first "]"
// and the pattern required a literal "@]" after one of `:/?#[`.
export const gen_delims = new RegExp("[:/?#\\[\\]@]");
export const sub_delims = new RegExp("[!$&'()*+,;=]");
export const reserved = new RegExp(group("reserved", gen_delims, sub_delims));
// "unreserved" in RFC 3986; the exported name is kept as is.
export const unserved = new RegExp("[a-zA-Z0-9._~-]");
export const pct_encoded = new RegExp("%[0-9a-fA-F]{2}");

export const scheme = new RegExp(group("scheme", "[a-zA-Z]" + group("", "[a-zA-Z0-9+.-]") + "*"));

// NOTE(review): unlike `reg_name` below, the "*" is still outside the named
// group, so `userinfo` captures only its last character.
export const userinfo = new RegExp(group("userinfo", unserved, pct_encoded, sub_delims, ":") + "*");
export const reg_name = new RegExp(group("reg_name", group("", unserved, pct_encoded, sub_delims) + "*"));
export const ipv4 = /((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])/;
export const host = new RegExp(group("host", ipv4, reg_name));
export const port = new RegExp(group("port", "[0-9]*"));
export const authority = new RegExp(opt(group("", source(userinfo) + "@")) + source(host) + opt(group("", ":" + source(port))));

// The factories take a suffix so every generated group name is unique inside
// the final pattern.
export const pchar = (/** @type {string} */ name) => new RegExp(group("pchar_" + name, unserved, pct_encoded, sub_delims, ":", "@"));
export const segment = (/** @type {string} */ name) => new RegExp(group("segment_" + name, group("", pchar("segment_" + name)) + "*"));
export const segment_nz = (/** @type {string} */ name) => new RegExp(group("segment_nz_" + name, group("", pchar("segment_nz_" + name)) + "+"));
// NOTE(review): RFC 3986 segment-nz-nc is 1*( ... ); this matches one character.
export const segment_nz_nc = (/** @type {string} */ name) => new RegExp(group("segment_nz_nc_" + name, unserved, pct_encoded, sub_delims, "@"));

export const path_abempty = new RegExp(group("path_abempty", group("", "/" + source(segment("path_abempty"))) + "*"));
// NOTE(review): opt() appends "?" right after "+", which gives a lazy "+?"
// instead of making the tail optional as in
// path-absolute = "/" [ segment-nz *( "/" segment ) ].
export const path_absolute = new RegExp(group("path_absolute", group("", "/" + opt(source(segment_nz("path_absolute")) + group("", "/" + source(segment("path_absolute"))) + "+"))));
export const path_noscheme = new RegExp(group("path_noscheme", group("", source(segment_nz_nc("path_noscheme")) + group("", "/" + source(segment("path_noscheme"))) + "*")));
export const path_rootless = new RegExp(group("path_rootless", group("", source(segment_nz("path_rootless")) + group("", "/" + source(segment("path_rootless"))) + "*")));
// The trailing "(?:)" alternative is the empty path.
export const path = new RegExp(group("path", path_abempty, path_absolute, path_noscheme, path_rootless, "(?:)"));

export const query = new RegExp(group("query", group("", pchar("query"), "/", "\\?") + "*"));
export const fragment = new RegExp(group("fragment", group("", pchar("fragment"), "/", "\\?") + "*"));

// Only the `"//" authority path` form of hier-part is generated.
export const hier_part = new RegExp(group("hier_part", "//" + source(authority) + source(path)));

// absolute-URI = scheme ":" hier-part [ "?" query ]
// Fixed: uses source(query). Concatenating the RegExp object itself inserted
// its "/.../" literal form, so a query only matched when wrapped in slashes.
export const absolute_uri = new RegExp(source(scheme) + ":" + source(hier_part) + opt(group("", "\\?" + source(query))));
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
export const uri = new RegExp(source(scheme) + ":" + source(hier_part) + opt(group("", "\\?" + source(query))) + opt(group("", "#" + source(fragment))));
/(?<scheme>[a-zA-Z]([a-zA-Z0-9+.-])*):(?<hier_part>\/\/((?<userinfo>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:)*@)?(?<host>((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?<reg_name>([a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=])*))(:(?<port>[0-9]*))?(?<path>(?<path_abempty>(\/(?<segment_path_abempty>((?<pchar_segment_path_abempty>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*)|(?<path_absolute>(\/(?<segment_nz_path_absolute>((?<pchar_segment_nz_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_absolute>((?<pchar_segment_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))+?))|(?<path_noscheme>((?<segment_nz_nc_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|@)(\/(?<segment_path_noscheme>((?<pchar_segment_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?<path_rootless>((?<segment_nz_path_rootless>((?<pchar_segment_nz_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_rootless>((?<pchar_segment_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?:)))(\?(?<query>((?<pchar_query>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?(#(?<fragment>((?<pchar_fragment>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?/

実行結果:

❯ node .
[
  'https://zenn.dev/yuimaru/scraps/8ce132bc8aa6c9?k=v&i#unko',
  'https',
  's',
  '//zenn.dev/yuimaru/scraps/8ce132bc8aa6c9',
  undefined,
  undefined,
  'zenn.dev',
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  'zenn.dev',
  'v',
  undefined,
  undefined,
  '/yuimaru/scraps/8ce132bc8aa6c9',
  '/yuimaru/scraps/8ce132bc8aa6c9',
  '/8ce132bc8aa6c9',
  '8ce132bc8aa6c9',
  '9',
  '9',
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  undefined,
  '?k=v&i',
  'k=v&i',
  'i',
  'i',
  '#unko',
  'unko',
  'o',
  'o',
  index: 0,
  input: 'https://zenn.dev/yuimaru/scraps/8ce132bc8aa6c9?k=v&i#unko',
  groups: [Object: null prototype] {
    scheme: 'https',
    hier_part: '//zenn.dev/yuimaru/scraps/8ce132bc8aa6c9',
    userinfo: undefined,
    host: 'zenn.dev',
    reg_name: 'zenn.dev',
    port: undefined,
    path: '/yuimaru/scraps/8ce132bc8aa6c9',
    path_abempty: '/yuimaru/scraps/8ce132bc8aa6c9',
    segment_path_abempty: '8ce132bc8aa6c9',
    pchar_segment_path_abempty: '9',
    path_absolute: undefined,
    segment_nz_path_absolute: undefined,
    pchar_segment_nz_path_absolute: undefined,
    segment_path_absolute: undefined,
    pchar_segment_path_absolute: undefined,
    path_noscheme: undefined,
    segment_nz_nc_path_noscheme: undefined,
    segment_path_noscheme: undefined,
    pchar_segment_path_noscheme: undefined,
    path_rootless: undefined,
    segment_nz_path_rootless: undefined,
    pchar_segment_nz_path_rootless: undefined,
    segment_path_rootless: undefined,
    pchar_segment_path_rootless: undefined,
    query: 'k=v&i',
    pchar_query: 'i',
    fragment: 'unko',
    pchar_fragment: 'o'
  }
]
yuimaruyuimaru

クソの IPv4 アドレスのついでにクソの IPv6 アドレスを追加して 4829 文字の正規表現が爆誕!

yuimaruyuimaru

生成に使ったもの

// @ts-check

// Generator for an RFC 3986 URI-extraction pattern with IPv4 and IPv6 hosts.
// Each export is one ABNF rule rendered as a RegExp; `uri` is the end product.
// NOTE: the fixes marked "Fixed:" below change the generated `uri` pattern, so
// it no longer equals the pattern printed in the write-up. Regenerate it with
// `console.log(uri)`.

/**
 * Concatenates pattern fragments into one pattern string.
 * Strings are used verbatim, RegExp objects contribute their `.source`,
 * anything else goes through `String()`.
 */
const source = (/** @type {any[]} */ ...r) =>
  r
    .map((x) =>
      typeof x === "string" ? x : x instanceof RegExp ? x.source : String(x),
    )
    .join("");

/**
 * Wraps the alternatives `r` (joined with "|") in one capturing group.
 * @param {string} name group name; an empty string yields an unnamed group
 * @param {(string | RegExp)[]} r alternatives
 * @returns {string} `(?<name>a|b|...)` or `(a|b|...)`
 */
const group = (name, ...r) => {
  const prefix = name ? "?<" + name + ">" : "";
  return "(" + prefix + r.map((x) => source(x)).join("|") + ")";
};

/**
 * Appends "?" to the fragment. The "?" binds to the last atom or quantifier
 * of `r`, so `r` must already be a single group to become optional as a whole.
 */
const opt = (/** @type {string} */ r) => source(r) + "?";

// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
// Fixed: "[" and "]" are escaped. Previously the class ended at the first "]"
// and the pattern required a literal "@]" after one of `:/?#[`.
export const gen_delims = new RegExp("[:/?#\\[\\]@]");
export const sub_delims = new RegExp("[!$&'()*+,;=]");
export const reserved = new RegExp(group("reserved", gen_delims, sub_delims));
// "unreserved" in RFC 3986; the exported name is kept as is.
export const unserved = new RegExp("[a-zA-Z0-9._~-]");
export const pct_encoded = new RegExp("%[0-9a-fA-F]{2}");

export const scheme = new RegExp(
  group("scheme", "[a-zA-Z]" + group("", "[a-zA-Z0-9+.-]") + "*"),
);

// Fixed: the "*" is inside the named group (as in `reg_name`), so `userinfo`
// captures the whole component rather than only its last character.
export const userinfo = new RegExp(
  group("userinfo", group("", unserved, pct_encoded, sub_delims, ":") + "*"),
);
export const reg_name = new RegExp(
  group("reg_name", group("", unserved, pct_encoded, sub_delims) + "*"),
);

// The counters below give every generated group a unique name, because a
// rule such as h16 appears many times in the final pattern.
let dec_octetC = 0;
// Fixed: alternatives are tried in order, so the longest forms come first.
// With "[0-9]" first, the last octet of "192.168.0.100" matched just "1" and
// the rest of the URI pattern (which can match empty) accepted that.
export const dec_octet = () =>
  new RegExp(
    group(
      "dec_octet_" + (dec_octetC++).toString(),
      group(
        "",
        new RegExp("25[0-5]"),
        new RegExp("2[0-4][0-9]"),
        new RegExp("1[0-9][0-9]"),
        new RegExp("[1-9][0-9]"),
        new RegExp("[0-9]"),
      ),
    ),
  );

let ipv4addressC = 0;
// IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
export const ipv4address = () =>
  new RegExp(
    group(
      "ipv4address_" + (ipv4addressC++).toString(),
      group(
        "",
        source(dec_octet()) +
          "\\." +
          source(dec_octet()) +
          "\\." +
          source(dec_octet()) +
          "\\." +
          source(dec_octet()),
      ),
    ),
  );

let h16C = 0;
// h16 = 1*4HEXDIG
export const h16 = () =>
  new RegExp(
    group("h16_" + (h16C++).toString(), group("", "[0-9a-fA-F]{1,4}")),
  );

let ls32C = 0;
// ls32 = ( h16 ":" h16 ) / IPv4address
export const ls32 = () =>
  new RegExp(
    group(
      "ls32_" + (ls32C++).toString(),
      group("", source(h16()) + ":" + source(h16()), ipv4address()),
    ),
  );

// IPv6address: the nine alternatives of the RFC 3986 rule, in RFC order.
export const ipv6address = new RegExp(
  group(
    "ipv6address",
    group(
      "",
      //                            6( h16 ":" ) ls32
      group("", source(h16()) + ":") + "{6}" + source(ls32()),
      //                       "::" 5( h16 ":" ) ls32
      "::" + group("", source(h16()) + ":") + "{5}" + source(ls32()),
      // [               h16 ] "::" 4( h16 ":" ) ls32
      opt(group("", h16())) +
        "::" +
        group("", source(h16()) + ":") +
        "{4}" +
        source(ls32()),
      // [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
      opt(group("", group("", source(h16()) + ":") + "{0,1}" + source(h16()))) +
        "::" +
        group("", source(h16()) + ":") +
        "{3}" +
        source(ls32()),
      // [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
      opt(group("", group("", source(h16()) + ":") + "{0,2}" + source(h16()))) +
        "::" +
        group("", source(h16()) + ":") +
        "{2}" +
        source(ls32()),
      // [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
      opt(group("", group("", source(h16()) + ":") + "{0,3}" + source(h16()))) +
        "::" +
        source(h16()) +
        ":" +
        source(ls32()),
      // [ *4( h16 ":" ) h16 ] "::"              ls32
      opt(group("", group("", source(h16()) + ":") + "{0,4}" + source(h16()))) +
        "::" +
        source(ls32()),
      // [ *5( h16 ":" ) h16 ] "::"              h16
      opt(group("", group("", source(h16()) + ":") + "{0,5}" + source(h16()))) +
        "::" +
        source(h16()),
      // [ *6( h16 ":" ) h16 ] "::"
      opt(group("", group("", source(h16()) + ":") + "{0,6}" + source(h16()))) +
        "::",
    ),
  ),
);
// Only the IPv6address form of IP-literal is generated (no IPvFuture).
export const ipliteral = new RegExp("\\[" + source(ipv6address) + "\\]");
export const host = new RegExp(
  group("host", ipliteral, ipv4address(), reg_name),
);
export const port = new RegExp(group("port", "[0-9]*"));
// authority = [ userinfo "@" ] host [ ":" port ]
export const authority = new RegExp(
  opt(group("", source(userinfo) + "@")) +
    source(host) +
    opt(group("", ":" + source(port))),
);

// The factories take a suffix so every generated group name is unique inside
// the final pattern.
export const pchar = (/** @type {string} */ name) =>
  new RegExp(
    group("pchar_" + name, unserved, pct_encoded, sub_delims, ":", "@"),
  );
export const segment = (/** @type {string} */ name) =>
  new RegExp(
    group("segment_" + name, group("", pchar("segment_" + name)) + "*"),
  );
export const segment_nz = (/** @type {string} */ name) =>
  new RegExp(
    group("segment_nz_" + name, group("", pchar("segment_nz_" + name)) + "+"),
  );
// segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
// Fixed: added the "+" repetition; the rule used to match a single character.
export const segment_nz_nc = (/** @type {string} */ name) =>
  new RegExp(
    group(
      "segment_nz_nc_" + name,
      group("", unserved, pct_encoded, sub_delims, "@") + "+",
    ),
  );

export const path_abempty = new RegExp(
  group("path_abempty", group("", "/" + source(segment("path_abempty"))) + "*"),
);
// path-absolute = "/" [ segment-nz *( "/" segment ) ]
// Fixed: the optional tail is wrapped in a group before opt() and repeats with
// "*". Before, opt() turned the trailing "+" into a lazy "+?", so "/" and "/a"
// were not matched as a complete path-absolute.
export const path_absolute = new RegExp(
  group(
    "path_absolute",
    group(
      "",
      "/" +
        opt(
          group(
            "",
            source(segment_nz("path_absolute")) +
              group("", "/" + source(segment("path_absolute"))) +
              "*",
          ),
        ),
    ),
  ),
);
export const path_noscheme = new RegExp(
  group(
    "path_noscheme",
    group(
      "",
      source(segment_nz_nc("path_noscheme")) +
        group("", "/" + source(segment("path_noscheme"))) +
        "*",
    ),
  ),
);
export const path_rootless = new RegExp(
  group(
    "path_rootless",
    group(
      "",
      source(segment_nz("path_rootless")) +
        group("", "/" + source(segment("path_rootless"))) +
        "*",
    ),
  ),
);
// The trailing "(?:)" alternative is the empty path.
export const path = new RegExp(
  group(
    "path",
    path_abempty,
    path_absolute,
    path_noscheme,
    path_rootless,
    "(?:)",
  ),
);

export const query = new RegExp(
  group("query", group("", pchar("query"), "/", "\\?") + "*"),
);
export const fragment = new RegExp(
  group("fragment", group("", pchar("fragment"), "/", "\\?") + "*"),
);

// NOTE(review): RFC 3986 hier-part also allows path-absolute, path-rootless
// and path-empty without a leading "//". Only the `"//" authority path` form
// is generated here, so a URI such as "mailto:a@example.com" is not matched.
// Left unchanged so the extractor does not start matching more text.
export const hier_part = new RegExp(
  group("hier_part", "//" + source(authority) + source(path)),
);

// absolute-URI = scheme ":" hier-part [ "?" query ]
// Fixed: uses source(query). Concatenating the RegExp object itself inserted
// its "/.../" literal form, so a query only matched when wrapped in slashes.
export const absolute_uri = new RegExp(
  source(scheme) +
    ":" +
    source(hier_part) +
    opt(group("", "\\?" + source(query))),
);
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
export const uri = new RegExp(
  source(scheme) +
    ":" +
    source(hier_part) +
    opt(group("", "\\?" + source(query))) +
    opt(group("", "#" + source(fragment))),
);

成果物

/(?<scheme>[a-zA-Z]([a-zA-Z0-9+.-])*):(?<hier_part>\/\/((?<userinfo>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:)*@)?(?<host>\[(?<ipv6address>(((?<h16_0>([0-9a-fA-F]{1,4})):){6}(?<ls32_0>((?<h16_1>([0-9a-fA-F]{1,4})):(?<h16_2>([0-9a-fA-F]{1,4}))|(?<ipv4address_0>((?<dec_octet_0>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_1>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_2>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_3>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|::((?<h16_3>([0-9a-fA-F]{1,4})):){5}(?<ls32_1>((?<h16_4>([0-9a-fA-F]{1,4})):(?<h16_5>([0-9a-fA-F]{1,4}))|(?<ipv4address_1>((?<dec_octet_4>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_5>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_6>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_7>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|((?<h16_6>([0-9a-fA-F]{1,4})))?::((?<h16_7>([0-9a-fA-F]{1,4})):){4}(?<ls32_2>((?<h16_8>([0-9a-fA-F]{1,4})):(?<h16_9>([0-9a-fA-F]{1,4}))|(?<ipv4address_2>((?<dec_octet_8>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_9>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_10>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_11>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_10>([0-9a-fA-F]{1,4})):){0,1}(?<h16_11>([0-9a-fA-F]{1,4})))?::((?<h16_12>([0-9a-fA-F]{1,4})):){3}(?<ls32_3>((?<h16_13>([0-9a-fA-F]{1,4})):(?<h16_14>([0-9a-fA-F]{1,4}))|(?<ipv4address_3>((?<dec_octet_12>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_13>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_14>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_15>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_15>([0-9a-fA-F]{1,4})):){0,2}(?<h16_16>([0-9a-fA-F]{1,4})))?::((?<h16_17>([0-9a-fA-F]{1,4})):){2}(?<ls32_4>((?<h16_18>([0-9a-fA-F]{1,4})):(?<h16_1
9>([0-9a-fA-F]{1,4}))|(?<ipv4address_4>((?<dec_octet_16>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_17>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_18>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_19>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_20>([0-9a-fA-F]{1,4})):){0,3}(?<h16_21>([0-9a-fA-F]{1,4})))?::(?<h16_22>([0-9a-fA-F]{1,4})):(?<ls32_5>((?<h16_23>([0-9a-fA-F]{1,4})):(?<h16_24>([0-9a-fA-F]{1,4}))|(?<ipv4address_5>((?<dec_octet_20>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_21>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_22>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_23>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_25>([0-9a-fA-F]{1,4})):){0,4}(?<h16_26>([0-9a-fA-F]{1,4})))?::(?<ls32_6>((?<h16_27>([0-9a-fA-F]{1,4})):(?<h16_28>([0-9a-fA-F]{1,4}))|(?<ipv4address_6>((?<dec_octet_24>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_25>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_26>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_27>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))))|(((?<h16_29>([0-9a-fA-F]{1,4})):){0,5}(?<h16_30>([0-9a-fA-F]{1,4})))?::(?<h16_31>([0-9a-fA-F]{1,4}))|(((?<h16_32>([0-9a-fA-F]{1,4})):){0,6}(?<h16_33>([0-9a-fA-F]{1,4})))?::))\]|(?<ipv4address_7>((?<dec_octet_28>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_29>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_30>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))\.(?<dec_octet_31>([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5]))))|(?<reg_name>([a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=])*))(:(?<port>[0-9]*))?(?<path>(?<path_abempty>(\/(?<segment_path_abempty>((?<pchar_segment_path_abempty>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*)|(?<path_absolute>(\/(?<segment_nz_path_absolute>((?<pchar_segment_nz_path_absolut
e>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_absolute>((?<pchar_segment_path_absolute>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))+?))|(?<path_noscheme>((?<segment_nz_nc_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|@)(\/(?<segment_path_noscheme>((?<pchar_segment_path_noscheme>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?<path_rootless>((?<segment_nz_path_rootless>((?<pchar_segment_nz_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))+)(\/(?<segment_path_rootless>((?<pchar_segment_path_rootless>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@))*))*))|(?:)))(\?(?<query>((?<pchar_query>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?(#(?<fragment>((?<pchar_fragment>[a-zA-Z0-9._~-]|%[0-9a-fA-F]{2}|[!$&'()*+,;=]|:|@)|\/|\?)*))?/

もはや group がいっぱい付いていてお得!wぐらいのノリでしか使えない気がする.

泣いていいかな,これ.

yuimaruyuimaru

仕様守れているかはもうわかりません.寝るか

このスクラップは2024/01/17にクローズされました