refractorやlowlightで行番号を出したかった

Node.jsで動くシンタックスハイライトライブラリを調査する

refractor

4.0.0からESMになっているので、CommonJSで使いたい場合は3.3.1を入れる。
ここでは3.3.1を試す。

使い方

import refractor from "refractor";
import { inspect } from "util";

// Highlight a one-line JavaScript snippet with refractor.
const root = refractor.highlight('const foo = "Hello World";', "js");

// Dump the result with no depth limit so nested children are printed in full.
console.log(inspect(root, { depth: Infinity }));

出力結果

結果

[
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'keyword' ] },
    children: [ { type: 'text', value: 'const' } ]
  },
  { type: 'text', value: ' foo ' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'operator' ] },
    children: [ { type: 'text', value: '=' } ]
  },
  { type: 'text', value: ' ' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'string' ] },
    children: [ { type: 'text', value: '"Hello World"' } ]
  },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'punctuation' ] },
    children: [ { type: 'text', value: ';' } ]
  }
]

上のようにrefractor.highlightでは仮想ノードが得られる。

複数行のコードをハイライトする例

import refractor from "refractor";
import { inspect } from "util";

// Highlight a multi-line JavaScript snippet (doc comment + function) to see
// how line breaks show up in the resulting nodes.
const root = refractor.highlight(
  `
/**
 * Hello World
 */
function helloWorld() {
  return "Hello World";
}
`,
  "js"
);

// Dump the result with no depth limit so nested children are printed in full.
console.log(inspect(root, { depth: Infinity }));

出力結果

[
  { type: 'text', value: '\n' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'doc-comment', 'comment' ] },
    children: [ { type: 'text', value: '/**\n * Hello World\n */' } ]
  },
  { type: 'text', value: '\n' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'keyword' ] },
    children: [ { type: 'text', value: 'function' } ]
  },
  { type: 'text', value: ' ' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'function' ] },
    children: [ { type: 'text', value: 'helloWorld' } ]
  },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'punctuation' ] },
    children: [ { type: 'text', value: '(' } ]
  },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'punctuation' ] },
    children: [ { type: 'text', value: ')' } ]
  },
  { type: 'text', value: ' ' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'punctuation' ] },
    children: [ { type: 'text', value: '{' } ]
  },
  { type: 'text', value: '\n  ' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'keyword', 'control-flow' ] },
    children: [ { type: 'text', value: 'return' } ]
  },
  { type: 'text', value: ' ' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'string' ] },
    children: [ { type: 'text', value: '"Hello World"' } ]
  },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'punctuation' ] },
    children: [ { type: 'text', value: ';' } ]
  },
  { type: 'text', value: '\n' },
  {
    type: 'element',
    tagName: 'span',
    properties: { className: [ 'token', 'punctuation' ] },
    children: [ { type: 'text', value: '}' } ]
  },
  { type: 'text', value: '\n' }
]

改行情報は普通に\nで表される

suin

refractorは行番号を表現する方法がないっぽい

Grouping by line · Issue #8 · wooorm/refractor

上のissueにあった解決策は、ハイライト前にソースコードを\nごとにチャンク分けし、チャンクごとにハイライトするというものだが、これはうまくいかなそう。複数行コメント(/* ... */)やテンプレート文字列のように複数行にまたがるトークンは、行単位で分割してしまうと正しくハイライトできず、チャンク分けが乱暴すぎる場合が考えられる。

ASTをwalkして\nに出会うごとに<span>ノードで囲む方法も考えたが、refractorのASTは一見するとフラットだけど、テンプレート文字列などは階層構造になっていてやりづらそうだった。

template-string.js

`string
  ${`string
    ${`string`}
  `}
`;

import fs from "fs";
import refractor from "refractor";
import unistInspect from "unist-util-inspect";

// Highlight the nested template-string sample read from disk.
const node = refractor.highlight(
  fs.readFileSync("./template-string.js", "utf8"),
  "js"
);
// Print the resulting nodes as a tree.
console.log(unistInspect(node as any));

出力結果

├─0 element<span>[5]
│   │ properties: {"className":["token","template-string"]}
│   ├─0 element<span>[1]
│   │   │ properties: {"className":["token","template-punctuation","string"]}
│   │   └─0 text "`"
│   ├─1 element<span>[1]
│   │   │ properties: {"className":["token","string"]}
│   │   └─0 text "string\n  "
│   ├─2 element<span>[3]
│   │   │ properties: {"className":["token","interpolation"]}
│   │   ├─0 element<span>[1]
│   │   │   │ properties: {"className":["token","interpolation-punctuation","punctuation"]}
│   │   │   └─0 text "${"
│   │   ├─1 element<span>[5]
│   │   │   │ properties: {"className":["token","template-string"]}
│   │   │   ├─0 element<span>[1]
│   │   │   │   │ properties: {"className":["token","template-punctuation","string"]}
│   │   │   │   └─0 text "`"
│   │   │   ├─1 element<span>[1]
│   │   │   │   │ properties: {"className":["token","string"]}
│   │   │   │   └─0 text "string\n    "
│   │   │   ├─2 element<span>[3]
│   │   │   │   │ properties: {"className":["token","interpolation"]}
│   │   │   │   ├─0 element<span>[1]
│   │   │   │   │   │ properties: {"className":["token","interpolation-punctuation","punctuation"]}
│   │   │   │   │   └─0 text "${"
│   │   │   │   ├─1 element<span>[3]
│   │   │   │   │   │ properties: {"className":["token","template-string"]}
│   │   │   │   │   ├─0 element<span>[1]
│   │   │   │   │   │   │ properties: {"className":["token","template-punctuation","string"]}
│   │   │   │   │   │   └─0 text "`"
│   │   │   │   │   ├─1 element<span>[1]
│   │   │   │   │   │   │ properties: {"className":["token","string"]}
│   │   │   │   │   │   └─0 text "string"
│   │   │   │   │   └─2 element<span>[1]
│   │   │   │   │       │ properties: {"className":["token","template-punctuation","string"]}
│   │   │   │   │       └─0 text "`"
│   │   │   │   └─2 element<span>[1]
│   │   │   │       │ properties: {"className":["token","interpolation-punctuation","punctuation"]}
│   │   │   │       └─0 text "}"
│   │   │   ├─3 element<span>[1]
│   │   │   │   │ properties: {"className":["token","string"]}
│   │   │   │   └─0 text "\n  "
│   │   │   └─4 element<span>[1]
│   │   │       │ properties: {"className":["token","template-punctuation","string"]}
│   │   │       └─0 text "`"
│   │   └─2 element<span>[1]
│   │       │ properties: {"className":["token","interpolation-punctuation","punctuation"]}
│   │       └─0 text "}"
│   ├─3 element<span>[1]
│   │   │ properties: {"className":["token","string"]}
│   │   └─0 text "\n"
│   └─4 element<span>[1]
│       │ properties: {"className":["token","template-punctuation","string"]}
│       └─0 text "`"
├─1 element<span>[1]
│   │ properties: {"className":["token","punctuation"]}
│   └─0 text ";"
└─2 text "\n"

suin

lowlight

refractorと同じ作者が作っているhighlight.jsベースのライブラリ
2系からはESM。CommonJSは1系を使う。

ここでは1系を使ってみる。

pnpm add lowlight@^1
pnpm add -D @types/lowlight

suin

lowlightのシンプルな使い方

import low from "lowlight";
import unistUtilInspect from "unist-util-inspect";

// lowlight takes the language first and the source second; the nodes are on `.value`.
const tree = low.highlight("js", '"use strict";').value;

// Print the resulting nodes as a tree.
console.log(unistUtilInspect(tree as any));

出力結果

├─0 element<span>[1]
│   │ properties: {"className":["hljs-meta"]}
│   └─0 text "\"use strict\""
└─1 text ";"

suin

lowlightも行番号を出すのはつらそう

lowlightもrefractor同様にASTが入れ子状になるので、改行を検出してグルーピングするような処理は面倒そう。

import fs from "fs";
import low from "lowlight";
import unistUtilInspect from "unist-util-inspect";

// Highlight the nested template-string sample read from disk with lowlight.
const tree = low.highlight(
  "js",
  fs.readFileSync("./template-string.js", "utf8")
).value;

// Print the tree via the inspector's noColor variant (presumably plain text without ANSI colors).
console.log(unistUtilInspect.noColor(tree as any));

出力結果

├─0 element<span>[3]
│   │ properties: {"className":["hljs-string"]}
│   ├─0 text "`string\n  "
│   ├─1 element<span>[3]
│   │   │ properties: {"className":["hljs-subst"]}
│   │   ├─0 text "${"
│   │   ├─1 element<span>[3]
│   │   │   │ properties: {"className":["hljs-string"]}
│   │   │   ├─0 text "`string\n    "
│   │   │   ├─1 element<span>[3]
│   │   │   │   │ properties: {"className":["hljs-subst"]}
│   │   │   │   ├─0 text "${"
│   │   │   │   ├─1 element<span>[1]
│   │   │   │   │   │ properties: {"className":["hljs-string"]}
│   │   │   │   │   └─0 text "`string`"
│   │   │   │   └─2 text "}"
│   │   │   └─2 text "\n  `"
│   │   └─2 text "}"
│   └─2 text "\n`"
└─1 text ";\n"

suin

refractorのASTに行情報を付与するための試行錯誤

refractorのASTはツリー構造なので行ごとにトークンをグルーピングするのが難しい

なので

ASTをフラットにする
その上で、行ごとにグルーピングする処理を加える

を方針として、行情報付与をやってみる

suin

refractorのASTをフラットにする処理

template-string.js

`string`;

↑上のJavaScriptコードをハイライトしてみる

import fs from "fs";
import refractor from "refractor";
import { RefractorNode } from "refractor/core";
import unistInspect from "unist-util-inspect";

// hast-util-to-html is loaded with require here, unlike the imports above.
const toHtml = require("hast-util-to-html");

// Highlight the sample file; `node` is the list of top-level nodes.
const node = refractor.highlight(
  fs.readFileSync("./template-string.js", "utf8"),
  "js"
);

/**
 * Flattens a (possibly nested) Refractor AST into a single-level list.
 *
 * Every text node that sits below at least one element is wrapped in its own
 * `span` whose `className` is the de-duplicated union of the class names of
 * all its ancestor elements, ordered from outermost to innermost. Text nodes
 * with no element ancestor are returned unchanged. Elements themselves are
 * never emitted; only their text descendants are.
 *
 * @param nodes - Nodes to flatten.
 * @param className - Class names inherited from the ancestors of `nodes`.
 *   Top-level callers omit it; the recursion supplies it.
 * @returns A new array containing only text nodes and single-text `span`s.
 */
function flatten(
  nodes: RefractorNode[],
  className?: ReadonlySet<string>
): RefractorNode[] {
  // Push into one accumulator. The previous reduce + concat re-copied the
  // whole accumulator for every node, which is quadratic in the node count.
  const flat: RefractorNode[] = [];
  for (const node of nodes) {
    if (node.type === "element") {
      // A Set keeps first-seen order and drops duplicates such as "token".
      const inherited = new Set([
        ...(className || []),
        ...(node.properties.className || []),
      ]);
      for (const descendant of flatten(node.children, inherited)) {
        flat.push(descendant);
      }
    } else if (className) {
      // Text below an element: carry the accumulated classes on a fresh span.
      flat.push({
        type: "element",
        tagName: "span",
        properties: { className: [...className] },
        children: [node],
      });
    } else {
      // Top-level text has no classes to carry, so it is kept as is.
      flat.push(node);
    }
  }
  return flat;
}

// Before: print the original nodes as a tree and as HTML.
console.log(unistInspect(node as any));
console.log(toHtml({ type: "root", children: node }));

// Flatten the nodes.
const flat = flatten(node);

// After: print the flattened nodes the same way for comparison.
console.log(unistInspect(flat as any));
console.log(toHtml({ type: "root", children: flat }));

出力結果

before

├─0 element<span>[3]
│   │ properties: {"className":["token","template-string"]}
│   ├─0 element<span>[1]
│   │   │ properties: {"className":["token","template-punctuation","string"]}
│   │   └─0 text "`"
│   ├─1 element<span>[1]
│   │   │ properties: {"className":["token","string"]}
│   │   └─0 text "string"
│   └─2 element<span>[1]
│       │ properties: {"className":["token","template-punctuation","string"]}
│       └─0 text "`"
├─1 element<span>[1]
│   │ properties: {"className":["token","punctuation"]}
│   └─0 text ";"
└─2 text "\n"
<span class="token template-string"><span class="token template-punctuation string">`</span><span class="token string">string</span><span class="token template-punctuation string">`</span></span><span class="token punctuation">;</span>

after

├─0 element<span>[1]
│   │ properties: {"className":["token","template-string","template-punctuation","string"]}
│   └─0 text "`"
├─1 element<span>[1]
│   │ properties: {"className":["token","template-string","string"]}
│   └─0 text "string"
├─2 element<span>[1]
│   │ properties: {"className":["token","template-string","template-punctuation","string"]}
│   └─0 text "`"
├─3 element<span>[1]
│   │ properties: {"className":["token","punctuation"]}
│   └─0 text ";"
└─4 text "\n"
<span class="token template-string template-punctuation string">`</span><span class="token template-string string">string</span><span class="token template-string template-punctuation string">`</span><span class="token punctuation">;</span>

suin

フラット化すると、ASTの構造が次のようなかなりシンプルなものになる。

// A flattened AST: a single-level list of nodes.
type FlatNodes = FlatNode[];

// A flattened node is either a span element or a bare text node.
type FlatNode = FlatElement | FlatText;

// A span element that holds exactly one text node.
type FlatElement = {
  type: "element";
  tagName: "span";
  properties: { className: string[] };
  children: [FlatText]; // the unbounded children array becomes a one-element tuple holding a text node
};

// A text node; its value may span several lines.
type FlatText = {
  type: "text";
  value: 改行が含まれてるかもしれない文字列;
};

// Alias for a string that may contain line breaks (which is what the Japanese name says).
type 改行が含まれてるかもしれない文字列 = string;

このFlatNodesをループしていって、改行が含まれてるかもしれない文字列を検査し、改行を見つけたらチャンクに区切っていけば良さそう。

suin

フラット化したノードを行ごとにグルーピングする処理

/**
 * Groups flattened nodes into one wrapper element per source line.
 *
 * Each node is split at its line breaks with `splitByLines`. Every piece
 * except the last one of a node ends its line, so the current wrapper is
 * closed after it and a new one is opened with the next 1-based line number.
 *
 * @param nodes - Flat nodes (bare text nodes or single-text spans).
 * @returns The line wrappers in order. A trailing line with no content is
 *   omitted.
 */
function groupByLines(nodes: FlatNodes): Array<LineElement> {
  const lineElements: Array<LineElement> = [];
  let currentLine = createLineElement(1);
  for (const node of nodes) {
    const pieces = splitByLines(node);
    const last = pieces.length - 1;
    pieces.forEach((piece, index) => {
      // Splitting text that ends with "\n" leaves an empty trailing piece.
      // Skip it for elements as well as bare text: the old check only looked
      // at bare text, so an empty span leaked onto the next line and could
      // create a phantom extra line at the end of the input.
      const text = piece.type === "text" ? piece : piece.children[0];
      if (text.value !== "") {
        currentLine.children.push(piece);
      }
      // Every piece but the last is followed by a line break.
      if (index !== last) {
        lineElements.push(currentLine);
        currentLine = createLineElement(lineElements.length + 1);
      }
    });
  }
  // Only emit the final line when it actually has content.
  if (currentLine.children.length > 0) {
    lineElements.push(currentLine);
  }
  return lineElements;
}

// Wrapper span for a single line: carries the line number in
// `data-line-number` and the line's flat nodes as children.
type LineElement = {
  type: "element";
  tagName: "span";
  properties: { ["data-line-number"]: number };
  children: FlatNodes;
};

/**
 * Builds an empty line wrapper.
 *
 * @param lineNumber - Value stored in the `data-line-number` property.
 * @returns A new `span` element with a fresh, empty `children` array.
 */
function createLineElement(lineNumber: number): LineElement {
  const line: LineElement = {
    type: "element",
    tagName: "span",
    properties: { "data-line-number": lineNumber },
    children: [],
  };
  return line;
}

/**
 * Splits one flat node at its line breaks.
 *
 * A text node is split directly. For an element, its single text child is
 * split and each piece is wrapped in a shallow copy of the element.
 *
 * @param node - A bare text node or a single-text span.
 * @returns The pieces in order, of the same node type as `node`.
 */
function splitByLines(node: FlatNode): FlatNodes {
  if (node.type !== "text") {
    const pieces = splitTextByLines(node.children[0]);
    return pieces.map<FlatElement>((piece) => {
      // Copy the element's own fields, then replace its only child.
      const wrapped: FlatElement = { ...node, children: [piece] };
      return wrapped;
    });
  }
  return splitTextByLines(node);
}

/**
 * Splits a text node after every "\n".
 *
 * Every piece except the last keeps its trailing "\n". The last piece is
 * whatever follows the final line break, which may be the empty string.
 *
 * @param text - The text node to split.
 * @returns New text nodes in order; an empty input node is returned as is,
 *   wrapped in a one-element array.
 */
function splitTextByLines(text: FlatText): Array<FlatText> {
  if (text.value.length === 0) {
    return [text];
  }
  const segments = text.value.split("\n");
  const finalIndex = segments.length - 1;
  const pieces: Array<FlatText> = [];
  for (let i = 0; i <= finalIndex; i++) {
    // split() dropped the separators; restore one after every non-final segment.
    const value = i === finalIndex ? segments[i] : segments[i] + "\n";
    pieces.push({ type: "text", value });
  }
  return pieces;
}

suin

上の処理に次のコードを与える

/**
 * コメント
 */
`文字列1行目
文字列2行目
文字列3行目`;

まず、const tree = refractor.highlight(上のコード, "js")を通したときの構造が↓のようになる:

treeの中身

├─0 element<span>[1]
│   │ properties: {"className":["token","doc-comment","comment"]}
│   └─0 text "/**\n * コメント\n */"
├─1 text "\n"
├─2 element<span>[3]
│   │ properties: {"className":["token","template-string"]}
│   ├─0 element<span>[1]
│   │   │ properties: {"className":["token","template-punctuation","string"]}
│   │   └─0 text "`"
│   ├─1 element<span>[1]
│   │   │ properties: {"className":["token","string"]}
│   │   └─0 text "文字列1行目\n文字列2行目\n文字列3行目"
│   └─2 element<span>[1]
│       │ properties: {"className":["token","template-punctuation","string"]}
│       └─0 text "`"
├─3 element<span>[1]
│   │ properties: {"className":["token","punctuation"]}
│   └─0 text ";"
└─4 text "\n"

HTML

<span class="token doc-comment comment">/**
 * コメント
 */</span>
<span class="token template-string"><span class="token template-punctuation string">`</span><span class="token string">文字列1行目
文字列2行目
文字列3行目</span><span class="token template-punctuation string">`</span></span><span class="token punctuation">;</span>

この段階では、構造がフラットではないので、次にこの構造をフラット化する

const flat = flatten(tree);

フラット化された構造:

flatの中身

├─0 element<span>[1]
│   │ properties: {"className":["token","doc-comment","comment"]}
│   └─0 text "/**\n * コメント\n */"
├─1 text "\n"
├─2 element<span>[1]
│   │ properties: {"className":["token","template-string","template-punctuation","string"]}
│   └─0 text "`"
├─3 element<span>[1]
│   │ properties: {"className":["token","template-string","string"]}
│   └─0 text "文字列1行目\n文字列2行目\n文字列3行目"
├─4 element<span>[1]
│   │ properties: {"className":["token","template-string","template-punctuation","string"]}
│   └─0 text "`"
├─5 element<span>[1]
│   │ properties: {"className":["token","punctuation"]}
│   └─0 text ";"
└─6 text "\n"

HTML

<span class="token doc-comment comment">/**
 * コメント
 */</span>
<span class="token template-string template-punctuation string">`</span><span class="token template-string string">文字列1行目
文字列2行目
文字列3行目</span><span class="token template-string template-punctuation string">`</span><span class="token punctuation">;</span>

HTMLとしての表示はフラット化前と大差がない。

そして、フラット化した構造を行単位でグルーピングする処理

const lines = groupByLines(flat);

linesの中身

├─0 element<span>[1]
│   │ properties: {"data-line-number":1}
│   └─0 element<span>[1]
│       │ properties: {"className":["token","doc-comment","comment"]}
│       └─0 text "/**\n"
├─1 element<span>[1]
│   │ properties: {"data-line-number":2}
│   └─0 element<span>[1]
│       │ properties: {"className":["token","doc-comment","comment"]}
│       └─0 text " * コメント\n"
├─2 element<span>[2]
│   │ properties: {"data-line-number":3}
│   ├─0 element<span>[1]
│   │   │ properties: {"className":["token","doc-comment","comment"]}
│   │   └─0 text " */"
│   └─1 text "\n"
├─3 element<span>[2]
│   │ properties: {"data-line-number":4}
│   ├─0 element<span>[1]
│   │   │ properties: {"className":["token","template-string","template-punctuation","string"]}
│   │   └─0 text "`"
│   └─1 element<span>[1]
│       │ properties: {"className":["token","template-string","string"]}
│       └─0 text "文字列1行目\n"
├─4 element<span>[1]
│   │ properties: {"data-line-number":5}
│   └─0 element<span>[1]
│       │ properties: {"className":["token","template-string","string"]}
│       └─0 text "文字列2行目\n"
└─5 element<span>[4]
    │ properties: {"data-line-number":6}
    ├─0 element<span>[1]
    │   │ properties: {"className":["token","template-string","string"]}
    │   └─0 text "文字列3行目"
    ├─1 element<span>[1]
    │   │ properties: {"className":["token","template-string","template-punctuation","string"]}
    │   └─0 text "`"
    ├─2 element<span>[1]
    │   │ properties: {"className":["token","punctuation"]}
    │   └─0 text ";"
    └─3 text "\n"

HTML

<span data-line-number="1"
  ><span class="token doc-comment comment">/** </span></span
><span data-line-number="2"
  ><span class="token doc-comment comment"> * コメント </span></span
><span data-line-number="3"
  ><span class="token doc-comment comment"> */</span> </span
><span data-line-number="4"
  ><span class="token template-string template-punctuation string">`</span
  ><span class="token template-string string">文字列1行目 </span></span
><span data-line-number="5"
  ><span class="token template-string string">文字列2行目 </span></span
><span data-line-number="6"
  ><span class="token template-string string">文字列3行目</span
  ><span class="token template-string template-punctuation string">`</span
  ><span class="token punctuation">;</span>
</span>

※行単位で<span>で囲まれているのが分かりやすいようにprettierで整形しています。

行単位でグルーピングできた。

suin

ここまでできたし、npmモジュール化しようかな？

suin

flattenをnpmパッケージ化した: suin/refractor-flatten: A utility to transform Refractor ASTs to flat arrays.

suin

group-by-linesをnpmパッケージ化した: suin/refractor-group-by-lines: A utility to embed line information to Refractor ASTs