libvaxis/src/gwidth.zig at main · omkarbhad/libvaxis · GitHub - github.com

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
const std = @import("std");
const unicode = std.unicode;
const testing = std.testing;
const uucode = @import("uucode");

/// the method to use when calculating the width of a grapheme
pub const Method = enum {
    unicode,
    wcwidth,
    no_zwj,
};

/// Calculate width from east asian width property and Unicode properties
fn eawToWidth(cp: u21, eaw: uucode.types.EastAsianWidth) i16 {
    // Based on wcwidth implementation
    // Control characters
    if (cp == 0) return 0;
    if (cp < 32 or (cp >= 0x7f and cp < 0xa0)) return -1;

    // Use general category for comprehensive zero-width detection
    const gc = uucode.get(.general_category, cp);
    switch (gc) {
        .mark_nonspacing, .mark_enclosing => return 0,
        else => {},
    }

    // Additional zero-width characters not covered by general category
    if (cp == 0x00ad) return 0; // soft hyphen
    if (cp == 0x200b) return 0; // zero-width space
    if (cp == 0x200c) return 0; // zero-width non-joiner
    if (cp == 0x200d) return 0; // zero-width joiner
    if (cp == 0x2060) return 0; // word joiner
    if (cp == 0x034f) return 0; // combining grapheme joiner
    if (cp == 0xfeff) return 0; // zero-width no-break space (BOM)
    if (cp >= 0x180b and cp <= 0x180d) return 0; // Mongolian variation selectors
    if (cp >= 0xfe00 and cp <= 0xfe0f) return 0; // variation selectors
    if (cp >= 0xe0100 and cp <= 0xe01ef) return 0; // Plane-14 variation selectors

    // East Asian Width: fullwidth or wide = 2
    // ambiguous in East Asian context = 2, otherwise 1
    // halfwidth, narrow, or neutral = 1
    return switch (eaw) {
        .fullwidth, .wide => 2,
        else => 1,
    };
}

/// returns the width of the provided string, as measured by the method chosen
pub fn gwidth(str: []const u8, method: Method) u16 {
    switch (method) {
        .unicode => {
            var total: u16 = 0;
            var grapheme_iter = uucode.grapheme.Iterator(uucode.utf8.Iterator).init(.init(str));

            var grapheme_start: usize = 0;
            var prev_break: bool = true;

            while (grapheme_iter.nextCodePoint()) |result| {
                if (prev_break and !result.is_break) {
                    // Start of a new grapheme
                    const cp_len: usize = std.unicode.utf8CodepointSequenceLength(result.code_point) catch 1;
                    grapheme_start = grapheme_iter.i - cp_len;
                }

                if (result.is_break) {
                    // End of a grapheme - calculate its width
                    const grapheme_end = grapheme_iter.i;
                    const grapheme_bytes = str[grapheme_start..grapheme_end];

                    // Calculate grapheme width
                    var g_iter = uucode.utf8.Iterator.init(grapheme_bytes);
                    var width: i16 = 0;
                    var has_emoji_vs: bool = false;
                    var has_text_vs: bool = false;
                    var has_emoji_presentation: bool = false;
                    var ri_count: u8 = 0;

                    while (g_iter.next()) |cp| {
                        // Check for emoji variation selector (U+FE0F)
                        if (cp == 0xfe0f) {
                            has_emoji_vs = true;
                            continue;
                        }

                        // Check for text variation selector (U+FE0E)
                        if (cp == 0xfe0e) {
                            has_text_vs = true;
                            continue;
                        }

                        // Check if this codepoint has emoji presentation
                        if (uucode.get(.is_emoji_presentation, cp)) {
                            has_emoji_presentation = true;
                        }

                        // Count regional indicators (for flag emojis)
                        if (cp >= 0x1F1E6 and cp <= 0x1F1FF) {
                            ri_count += 1;
                        }

                        const eaw = uucode.get(.east_asian_width, cp);
                        const w = eawToWidth(cp, eaw);
                        // Take max of non-zero widths
                        if (w > 0 and w > width) width = w;
                    }

                    // Handle variation selectors and emoji presentation
                    if (has_text_vs) {
                        // Text presentation explicit - keep width as-is (usually 1)
                        width = @max(1, width);
                    } else if (has_emoji_vs or has_emoji_presentation or ri_count == 2) {
                        // Emoji presentation or flag pair - force width 2
                        width = @max(2, width);
                    }

                    total += @max(0, width);

                    grapheme_start = grapheme_end;
                }
                prev_break = result.is_break;
            }

            return total;
        },
        .wcwidth => {
            var total: u16 = 0;
            var iter = uucode.utf8.Iterator.init(str);
            while (iter.next()) |cp| {
                const w: i16 = switch (cp) {
                    // undo an override in zg for emoji skintone selectors
                    0x1f3fb...0x1f3ff => 2,
                    else => blk: {
                        const eaw = uucode.get(.east_asian_width, cp);
                        break :blk eawToWidth(cp, eaw);
                    },
                };
                total += @intCast(@max(0, w));
            }
            return total;
        },
        .no_zwj => {
            var iter = std.mem.splitSequence(u8, str, "\u{200D}");
            var result: u16 = 0;
            while (iter.next()) |s| {
                result += gwidth(s, .unicode);
            }
            return result;
        },
    }
}

test "gwidth: a" {
    try testing.expectEqual(1, gwidth("a", .unicode));
    try testing.expectEqual(1, gwidth("a", .wcwidth));
    try testing.expectEqual(1, gwidth("a", .no_zwj));
}

test "gwidth: emoji with ZWJ" {
    try testing.expectEqual(2, gwidth("👩‍🚀", .unicode));
    try testing.expectEqual(4, gwidth("👩‍🚀", .wcwidth));
    try testing.expectEqual(4, gwidth("👩‍🚀", .no_zwj));
}

test "gwidth: emoji with VS16 selector" {
    try testing.expectEqual(2, gwidth("\xE2\x9D\xA4\xEF\xB8\x8F", .unicode));
    try testing.expectEqual(1, gwidth("\xE2\x9D\xA4\xEF\xB8\x8F", .wcwidth));
    try testing.expectEqual(2, gwidth("\xE2\x9D\xA4\xEF\xB8\x8F", .no_zwj));
}

test "gwidth: emoji with skin tone selector" {
    try testing.expectEqual(2, gwidth("👋🏿", .unicode));
    try testing.expectEqual(4, gwidth("👋🏿", .wcwidth));
    try testing.expectEqual(2, gwidth("👋🏿", .no_zwj));
}

test "gwidth: zero-width space" {
    try testing.expectEqual(0, gwidth("\u{200B}", .unicode));
    try testing.expectEqual(0, gwidth("\u{200B}", .wcwidth));
}

test "gwidth: zero-width non-joiner" {
    try testing.expectEqual(0, gwidth("\u{200C}", .unicode));
    try testing.expectEqual(0, gwidth("\u{200C}", .wcwidth));
}

test "gwidth: combining marks" {
    // Hebrew combining mark
    try testing.expectEqual(0, gwidth("\u{05B0}", .unicode));
    // Devanagari combining mark
    try testing.expectEqual(0, gwidth("\u{093C}", .unicode));
}

test "gwidth: flag emoji (regional indicators)" {
    // US flag 🇺🇸
    try testing.expectEqual(2, gwidth("🇺🇸", .unicode));
    // UK flag 🇬🇧
    try testing.expectEqual(2, gwidth("🇬🇧", .unicode));
}

test "gwidth: text variation selector" {
    // U+2764 (heavy black heart) + U+FE0E (text variation selector)
    // Should be width 1 with text presentation
    try testing.expectEqual(1, gwidth("❤︎", .unicode));
}

test "gwidth: keycap sequence" {
    // Digit 1 + U+FE0F + U+20E3 (combining enclosing keycap)
    // Should be width 2
    try testing.expectEqual(2, gwidth("1️⃣", .unicode));
}

test "gwidth: base letter with combining mark" {
    // 'a' + combining acute accent (NFD form)
    // Should be width 1 (combining mark is zero-width)
    try testing.expectEqual(1, gwidth("á", .unicode));
}

test {
    std.testing.refAllDecls(@This());
}