Translated by AI

The content below is an AI-generated translation. This is an experimental feature, and may contain errors. View original article
📌

[Zig] Converting between characters and code points

に公開

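You can use std.debug.print to convert between characters and code points. The {u} format specifier prints a code point as its character, and a character literal written in single quotes evaluates to its code point (a comptime_int). Note that a character literal must be a single code point; text composed of multiple code points cannot be used.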

const std = @import("std");
const print = std.debug.print;

/// Shows both directions of conversion using only `std.debug.print`.
pub fn main() !void {
    // Code point -> character: `{u}` renders the integer as a Unicode character.
    const code_point = 0x3042;
    print("{u}\n", .{code_point});

    // Character -> code point: a character literal is just an integer.
    const character = 'あ';
    print("U+{X}\n", .{character});

    // Confirm that the character literal has type `comptime_int`.
    const is_comptime_int = @TypeOf(character) == comptime_int;
    print("{}\n", .{is_comptime_int});
}

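For general byte sequences, use utf8Encode and utf8Decode from std.unicode. utf8Encode writes the encoded bytes into a caller-supplied buffer and returns the number of bytes it wrote.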

const std = @import("std");
const print = std.debug.print;
const utf8Encode = std.unicode.utf8Encode;
const utf8Decode = std.unicode.utf8Decode;

/// Encodes U+3042 into UTF-8 bytes and decodes the string "あ" back into a
/// code point, printing both results.
pub fn main() !void {
    // A single code point occupies at most 4 bytes in UTF-8.
    var buf: [4]u8 = undefined;
    // utf8Encode returns how many bytes it wrote. U+3042 does not fill the
    // whole buffer, so the remaining bytes stay undefined and must not be
    // printed.
    const len = try utf8Encode(0x3042, &buf);
    const cp = try utf8Decode("あ");

    // Print only the bytes that were actually encoded.
    print("{s}\n", .{buf[0..len]});
    print("U+{X}\n", .{cp});
}

You can also use an iterator for converting characters into code points.

const std = @import("std");
const print = std.debug.print;
const unicode = std.unicode;

/// Prints the code point of the first character of a UTF-8 string by walking
/// it with a `Utf8View` iterator.
pub fn main() !void {
    const text = "あいうえお";

    // Utf8View.init validates the bytes and fails if they are not valid UTF-8.
    const view = try unicode.Utf8View.init(text);
    var code_points = view.iterator();

    // The string is a non-empty literal, so the first code point is always present.
    const first = code_points.nextCodepoint().?;

    print("U+{X}\n", .{first});
}

A note on utf8Decode: the variants utf8Decode2, utf8Decode3, and utf8Decode4, which decode 2-, 3-, and 4-byte sequences respectively, are also exposed as public APIs. utf8ByteSequenceLength returns the number of bytes in a character's UTF-8 encoding, determined from its leading byte.

const std = @import("std");
const print = std.debug.print;
const utf8ByteSequenceLength =  std.unicode.utf8ByteSequenceLength;
const utf8Decode = std.unicode.utf8Decode;
const utf8Decode2 = std.unicode.utf8Decode2;
const utf8Decode3 = std.unicode.utf8Decode3;
const utf8Decode4 = std.unicode.utf8Decode4;

/// For one sample character of each UTF-8 length (1 to 4 bytes), prints the
/// sequence length derived from the leading byte, followed by the decoded
/// code point.
pub fn main() !void {
    // 1 byte: ASCII goes through the general-purpose decoder.
    const one_byte = "a";
    const one_len = try utf8ByteSequenceLength(one_byte[0]);
    print("{} bytes ", .{one_len});
    const one_cp = try utf8Decode(one_byte);
    print("U+{X}\n", .{one_cp});

    // 2 bytes
    const two_bytes = "α";
    const two_len = try utf8ByteSequenceLength(two_bytes[0]);
    print("{} bytes ", .{two_len});
    const two_cp = try utf8Decode2(two_bytes);
    print("U+{X}\n", .{two_cp});

    // 3 bytes
    const three_bytes = "あ";
    const three_len = try utf8ByteSequenceLength(three_bytes[0]);
    print("{} bytes ", .{three_len});
    const three_cp = try utf8Decode3(three_bytes);
    print("U+{X}\n", .{three_cp});

    // 4 bytes
    const four_bytes = "🐶";
    const four_len = try utf8ByteSequenceLength(four_bytes[0]);
    print("{} bytes ", .{four_len});
    const four_cp = try utf8Decode4(four_bytes);
    print("U+{X}\n", .{four_cp});
}

Discussion