Translated by AI
Comparing Bulk and Line-by-Line Processing for JavaScript String Replacement
I am syuribox, and I develop HTML tools for the web using Vanilla JavaScript.
Let's assume an environment where only local HTML/CSS/JS files or file references on a server are available. Suppose there is row-based processing for text ranging from an average of 100,000 characters to a maximum of 1,000,000 characters. If one line contains 20 characters, that results in anywhere from 5,000 to 500,000 lines. This data consists of text such as novel manuscripts, blog posts, or news articles (this also applies to log data analysis depending on the use case).
The program has an input form with a textarea where you copy-paste text or load a file, and it performs the necessary processing on that input line by line, then reports the results.
In this scenario, I would like to explore which approach is smarter: "preprocessing the entire text in bulk up front" or "preprocessing each line individually, right before that line's main processing."
Prediction (or Wishful Thinking)
My wishful thinking was that processing line by line might be faster because it only creates a copy of the line when a replacement is needed.
In a naively written C/C++ program, this prediction might have been correct. However, since JavaScript is a language equipped with a Garbage Collector (GC), the situation is different. So, let's put it to the test.
Benchmark Comparison: Bulk vs. Line-by-Line
/**
 * Shorthand for looking up an element through `document.getElementById`.
 *
 * @param {string} id - Id passed straight to `document.getElementById`.
 * @returns {*} Whatever `document.getElementById` returns for that id.
 */
function $(id) {
  const element = document.getElementById(id);
  return element;
}
// Lookup table: character to replace -> replacement character.
// Two keys (𥡴 and 𠮟) lie outside the BMP, so each is two UTF-16 code units
// long; code that looks characters up here must match them by code point.
const myPrereplaceMap = {
  '咒': '呪',
  '堽': '岡',
  '崕': '崖',
  '彌': '弥',
  '曾': '曽',
  '溯': '遡',
  '璢': '瑠',
  '瘦': '痩',
  '篭': '籠',
  '艷': '艶',
  '葢': '蓋',
  '餠': '餅',
  '麵': '麺',
  '龜': '亀',
  '𥡴': '稽',
  '剝': '剥',
  '塡': '填',
  '頰': '頬',
  '𠮟': '叱',
};
/**
 * HTML-escapes `str` and then swaps the characters listed in
 * `myPrereplaceMap`, using two regex passes with callback replacers
 * (one pass for the five HTML-special characters, one for the kanji).
 *
 * @param {string} str - Text to preprocess.
 * @returns {string} The escaped text with the listed kanji replaced.
 */
function preProcessOptimized(str) {
  // Maps one HTML-special character to its entity.
  const toEntity = (ch) => {
    if (ch === '\u0026') return '\u0026amp;';
    if (ch === '\u003c') return '\u0026lt;';
    if (ch === '\u003e') return '\u0026gt;';
    if (ch === "'") return '\u0026#39;';
    if (ch === '"') return '\u0026#34;';
    return ch;
  };
  // Falls back to the matched character when the table has no entry.
  const toReplacement = (ch) => myPrereplaceMap[ch] || ch;

  const escaped = str.replace(/[\u0026\u003c\u003e'"]/g, toEntity);
  // The `u` flag makes the class match the non-BMP characters as whole code points.
  return escaped.replace(/[咒堽崕彌曾溯璢瘦篭艷葢餠麵龜𥡴剝塡頰𠮟]/ug, toReplacement);
}
/**
 * Escapes the five HTML-special characters (&, <, >, ' and ") as entities.
 *
 * @param {string} str - Text to escape.
 * @returns {string} The escaped text.
 */
function htmlEscape(str) {
  // The ampersand goes first; otherwise the entities produced by the later
  // steps would themselves be escaped again.
  const ampersandsDone = str.replace(/\u0026/g, "\u0026amp;");
  const lessThanDone = ampersandsDone.replace(/\u003c/g, "\u0026lt;");
  const greaterThanDone = lessThanDone.replace(/\u003e/g, "\u0026gt;");
  const apostrophesDone = greaterThanDone.replace(/'/g, '\u0026#39;');
  return apostrophesDone.replace(/"/g, '\u0026#34;');
}
/**
 * HTML-escapes `str`, then replaces each character of the "before" table
 * with the character at the same position in the "after" table, one
 * `replaceAll` call per character.
 *
 * @param {string} str - Text to preprocess.
 * @returns {string} The escaped text with the listed kanji replaced.
 */
function preProcess(str) {
  // Split the tables by code point, not by UTF-16 code unit: 𥡴 and 𠮟
  // (U+20B9F) are surrogate pairs, so indexing the raw strings would
  // misalign every pair after them and run past the end of the "after" table.
  const befores = Array.from('咒堽崕彌曾溯璢瘦篭艷葢餠麵龜𥡴剝塡頰𠮟');
  const afters = Array.from('呪岡崖弥曽遡瑠痩籠艶蓋餅麺亀稽剥填頬叱');

  let result = htmlEscape(str);
  befores.forEach((before, index) => {
    result = result.replaceAll(before, afters[index]);
  });
  return result;
}
/**
 * Main replacement step: rewrites the two sample words and appends a warning
 * mark after every occurrence of the character to check.
 *
 * @param {string} line - Text to process (a single line or a whole document).
 * @returns {string} The processed text.
 */
function mainProcess(line) {
  // The result must be returned; callers assign and join the return value.
  return line
    .replaceAll('さんぷるA', 'さんぷるB')
    .replaceAll('サンプルA', 'サンプルB')
    .replaceAll('狂', '狂■[注意]');
}
/**
 * Variant A: runs `preProcess` once over the whole text, then applies
 * `mainProcess` to each line separately.
 *
 * @param {string} text - Full input text; lines are separated by '\n'.
 * @returns {string} The processed lines joined with '\n'.
 */
function myProcessAll(text) {
  const preprocessed = preProcess(text);
  return preprocessed
    .split('\n')
    .map((row) => mainProcess(row))
    .join('\n');
}
/**
 * Variants B and C: splits the text into lines and runs the preprocessing
 * step and then `mainProcess` on every line individually.
 *
 * @param {string} text - Full input text; lines are separated by '\n'.
 * @param {boolean} useOptimized - Truthy selects `preProcessOptimized`,
 *   otherwise `preProcess` is used.
 * @returns {string} The processed lines joined with '\n'.
 */
function myProcessLine(text, useOptimized) {
  const preprocessLine = useOptimized ? preProcessOptimized : preProcess;
  return text
    .split('\n')
    .map((row) => mainProcess(preprocessLine(row)))
    .join('\n');
}
/**
 * Variant D: runs the preprocessing step and `mainProcess` once each over the
 * whole text, without splitting it into lines.
 *
 * @param {string} text - Full input text.
 * @param {boolean} [useOptimized] - Truthy selects `preProcessOptimized`;
 *   otherwise (including when omitted) `preProcess` is used. This mirrors
 *   `myProcessLine`; the flag used to be accepted but silently ignored.
 * @returns {*} Whatever `mainProcess` returns for the preprocessed text.
 */
function myProcessNoLine(text, useOptimized) {
  const preprocessed = useOptimized ? preProcessOptimized(text) : preProcess(text);
  return mainProcess(preprocessed);
}
/**
 * Counts how many batches of `proc` calls complete within a time budget.
 *
 * A batch is `unit` consecutive calls of `proc(param1, param2)`. Batches are
 * started until `ms` milliseconds have elapsed since the function was entered
 * or `limit` batches have run, whichever comes first. A batch that has started
 * always runs to completion, so the budget can be overshot by up to one batch.
 *
 * @param {number} ms - Time budget in milliseconds, measured with `performance.now()`.
 * @param {number} unit - Number of `proc` calls per batch.
 * @param {number} limit - Maximum number of batches.
 * @param {Function} proc - Function under test.
 * @param {*} param1 - First argument passed to `proc`.
 * @param {*} param2 - Second argument passed to `proc`.
 * @returns {number} Number of completed batches.
 */
function counterbyTime(ms, unit, limit, proc, param1, param2) {
  // The reference time is fixed: resetting it every batch would compare only
  // the duration of the last batch against `ms`, so a fast `proc` would
  // always run all the way to `limit`.
  const start = performance.now();
  let count = 0;
  while (count < limit && performance.now() - start < ms) {
    for (let i = 0; i < unit; i++) {
      proc(param1, param2);
    }
    count++;
  }
  return count;
}
/**
 * Runs the benchmark: builds a test text from 299 copies of a sample
 * paragraph, then performs five rounds scheduled through `window.setTimeout`
 * with a 1000 ms delay before each round. In every round the four processing
 * variants are passed to `counterbyTime`, and the accumulated report is
 * written to the `innerText` of the element with id 'result'.
 */
function myReplaceAllorLineBench() {
  const textOrg = `
This is a Japanese test dataset.
Simulating a replacement program for novels, columns, or essays.
In the replacement text, "𠮟る" is integrated into "叱る".
This is because U+20B9F is often not processed correctly as a surrogate pair,
or detected as an invalid value during data validation.
Also, while variants of new kanji are integrated, the three JIS Level 3 characters
"剝塡頰" are replaced with variants within JIS Level 2.
In the main process, characters to check are replaced or output with warning marks.
Milk is best for people feeling down. さんぷるA
That being said, we encounter this often.`;
  // Extremely short times like 1ms can be unstable
  const ms = 5;
  const unit = 2;
  const limit = 10000;
  // Joining 300 empty slots places the separator 299 times, hence 299 copies.
  const text = textOrg.repeat(299);
  let report = '';

  function runRound(round) {
    const all = counterbyTime(ms, unit, limit, myProcessAll, text, undefined);
    const line = counterbyTime(ms, unit, limit, myProcessLine, text, false);
    const optz = counterbyTime(ms, unit, limit, myProcessLine, text, true);
    const noLine = counterbyTime(ms, unit, limit, myProcessNoLine, text, undefined);
    report += `A:${all} B:${line} C:${optz} D:${noLine} times/${ms}ms ${round}th run\n`;
    $('result').innerText = report;
    if (round >= 5) {
      return;
    }
    window.setTimeout(() => runRound(round + 1), 1000);
  }

  window.setTimeout(() => runRound(1), 1000);
}
// Start the benchmark whenever the element with id 'myFire' is clicked.
$('myFire').addEventListener('click', () => myReplaceAllorLineBench());
<!DOCTYPE html>
<html lang="ja">
<head>
<meta charset="utf-8">
<title>Comparison of bulk and line-by-line preprocessing with String.replaceAll</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<h1>Comparison of bulk and line-by-line preprocessing with String.replaceAll</h1>
<form>
<!-- The benchmark script looks up id 'myFire' to attach its click handler. -->
<button type="button" id='myFire'>
Run Test
</button>
</form>
<!-- The benchmark script writes its per-run report into this element. -->
<div id="result"></div>
<!-- Placed after the elements above so they exist when the script looks them up;
     presumably this file holds the benchmark script shown earlier (confirm). -->
<script type="text/javascript" src="line_all_replace.js" charset="utf-8"></script>
</body>
</html>
I initially used CodePen, but it felt slow. Since loop-heavy processing seems to get throttled, I switched to local execution.
Results
Firefox 149 / Windows11 x64
A:9532 B:1 C:720 D:4861 times/5ms 1st run
A:8282 B:1 C:723 D:9659 times/5ms 2nd run
A:10000 B:1 C:371 D:10000 times/5ms 3rd run
A:1 B:1 C:2 D:10000 times/5ms 4th run
A:1 B:1 C:1706 D:10000 times/5ms 5th run
Chrome 147 / Windows11 x64
A:1 B:1 C:2172 D:9890 times/5ms 1st run
A:6407 B:1 C:978 D:5377 times/5ms 2nd run
A:1 B:1 C:6233 D:10000 times/5ms 3rd run
A:1 B:1 C:677 D:3459 times/5ms 4th run
A:6856 B:1 C:566 D:10000 times/5ms 5th run
A: Bulk replacement: Reasonably fast
B: Line-by-line: Extremely slow
C: Optimized line-by-line: Moderately fast
D: Bulk replacement including main process: Fastest
What we can learn from this is that calling replaceAll or replace(/~~/g) repeatedly to perform massive bulk replacements is slow.
Reason 1: CPU Cache
This is a hypothesis, but let's analyze it. When calling replace multiple times, you are performing a full scan of the entire text each time. In other words, "total character count * number of calls" worth of memory access occurs. If this doesn't fit in the CPU cache, the speed drops drastically, making it impossible to even compare effectively. CPU caches are only a few MB in size, which is the bottleneck. Older machines might be even slower in general.
Reason 2: Garbage Collection (GC)
Each time replace is called, a "memory copy" occurs and accumulates in the GC. I thought doing it line-by-line would reduce the total byte volume of memory copies and improve speed, but that was incorrect. In reality, it seems to be an issue of managing too many objects, which leads to performance degradation.
Reason 3: Function Call Overhead
There is also the possibility that function call costs are simply high. This is true in C/C++ as well, where compiled languages can benefit from inlining small function calls. Jumping/calling from nearby memory to a distant location can also cause instruction cache misses.
Grok Proposal: Bulk Processing
In the context of this example, it is possible to perform everything in a single pass using the following method:
/**
 * Single-pass variant: HTML-escapes the text, swaps the characters listed in
 * `myPrereplaceMap`, and applies the same word/character replacements as
 * `mainProcess`, all within one `replace` call.
 *
 * @param {string} text - Text to process.
 * @returns {string} The processed text.
 */
function preProcessBest(text) {
  return text.replace(
    // Single characters go in the class; multi-character words must be
    // alternatives, because a character class only ever matches one character
    // and a `case` for a whole word could then never be reached.
    /[\u0026\u003c\u003e'"咒堽崕彌曾溯璢瘦篭艷葢餠麵龜𥡴剝塡頰𠮟狂]|さんぷるA|サンプルA/gu,
    (match) => {
      switch (match) {
        case '\u0026': return '\u0026amp;';
        case '\u003c': return '\u0026lt;';
        case '\u003e': return '\u0026gt;';
        case "'": return '\u0026#39;';
        case '"': return '\u0026#34;';
        case 'さんぷるA': return 'さんぷるB';
        case 'サンプルA': return 'サンプルB';
        case '狂': return '狂■[注意]';
        default:
          // Everything else the pattern can match is a key of the lookup table.
          return myPrereplaceMap[match] || match;
      }
    }
  );
}
Of course, in actual business applications, things are more complex, so this might not always work as easily.
Premise 1: String.split() + Array.join()
text = text.split('before').join('after');
text = text.split(/before/).join('after');
It is possible to perform bulk replacement this way, but split generates a large number of String objects and an Array with a length equal to the number of replacements, which is inefficient and considered slow.
text = text.replace(/before/g, 'after');
text = text.replaceAll('before', 'after');
Using replace with /g or the relatively new ES2021 replaceAll function seems better.
Generally, in JavaScript, processes completed within a single function (without unnecessary intermediate steps) seem to be internally optimized and faster. The fact that replaceAll was introduced when there was already a way to handle it with /g suggests it was determined to be superior not only in readability but also in processing speed. Let's use replaceAll. Note that replaceAll can also take regular expressions; just keep in mind that if you provide a regex, it must have the g flag, or it will throw a TypeError.
Well, it makes sense that a dedicated function would be faster.
Premise 2: Fundamental Question
Fundamentally, the C-style mindset takes a structure that splits data line-by-line for granted. However, if you can do it in bulk, that might be just fine as well.
Well, it's certainly a form of "opulent programming."
Summary and Conclusion
If you can do it in bulk, it's almost always faster.
Discussion