title: src/crelude/utf.h

utf.h

Functions

	Name
string	to_string(const byte * c)
runic	utf8_to_ucs4(runic dest, string src)
string	ucs4_to_utf8(string dest, runic src)
string	rune_to_utf8(string dest, rune ch)
usize	byte_offset(string , usize )
usize	char_num(string , usize )
rune	read_rune(string s, usize * i)
u0	next_rune(string , usize * ) Update byte-index to move to the next rune, skipping it.
u0	prev_rune(string , usize * ) Update byte-index to move to previous rune.
usize	read_escape(string src, rune * dest)
string	escape_rune(string dest, rune ch)
string	utf8_unescape(string dest, string src)
string	utf8_escape(string dest, string src, bool escape_quotes)
string	utf_strchr(string s, rune ch, usize * i)
usize	utf_strlen(string s)
usize	utf_seqlen(string ) Returns length of next UTF-8 sequence.
bool	is_locale_utf8(byte * locale)
bool	is_octal_digit(byte c)
bool	is_hex_digit(byte c)

Defines

	Name
	is_utf(c) Is `c` the start of a UTF-8 sequence?

Functions Documentation

function to_string

static inline string to_string(
    const byte * c
)

Parameters:

c NUL-terminated C-string to be wrapped

Return: UTF-8 string slice.

Wrap C-string to internal string slice. Length does not count the NUL-terminator.

function utf8_to_ucs4

runic utf8_to_ucs4(
    runic dest,
    string src
)

Parameters:

dest Empty runic structure.
src UTF-8 encoded string.

Return: Slice of converted runic dest with correct length.

Convert UTF-8 to UCS-4 (4-byte wide characters). No error checking is done: src must be valid UTF-8 and dest must be large enough. If dest.len >= src.len + 1, then there will always be enough space.

function ucs4_to_utf8

string ucs4_to_utf8(
    string dest,
    runic src
)

Parameters:

dest Empty string structure.
src UCS-4/UTF-32 encoded string.

Return: Slice of dest with converted UTF-8 bytes with correct length.

Convert UCS-4 to UTF-8; will try to NUL-terminate if there is space. No error checking is done. Ensure there is enough space in the dest string: dest.len >= 4 * src.len + 4 guarantees a fit.

function rune_to_utf8

string rune_to_utf8(
    string dest,
    rune ch
)

Parameters:

dest Empty string structure to hold UTF-8 bytes.
ch Single UCS-4 character / rune.

Return: Slice of dest with correct length.

Single UCS-4 rune to UTF-8 string. dest should allocate 4 bytes, or 5 if it is desired to NUL-terminate.

function byte_offset

usize byte_offset(
    string ,
    usize 
)

Character number to byte offset. Given the n-th character/rune in a string, how many bytes is that from the start of a UTF-8 string.

function char_num

usize char_num(
    string ,
    usize 
)

Byte offset to character number. Given a byte-offset from the start of a UTF-8 string, return how many characters/runes precede it.

function read_rune

rune read_rune(
    string s,
    usize * i
)

Parameters:

s String to step through.
i Index with value of current position in the string, updating to the next start-of-character byte.

Return: The character (rune).

Step through a string, one rune/character at a time, given the previous index and updating the index to the beginning of the next character.

function next_rune

u0 next_rune(
    string ,
    usize * 
)

Update byte-index to move to the next rune, skipping it.

function prev_rune

u0 prev_rune(
    string ,
    usize * 
)

Update byte-index to move to previous rune.

function read_escape

usize read_escape(
    string src,
    rune * dest
)

Parameters:

src String pointing to char after backslash.
dest Pointer to location where resulting rune is to be stored.

Return: How many bytes read as part of parsing (including u/U). Returns 0 if escape is invalid.

Take an unescaped UTF-8 string, where the start of the string points to the character right after the backslash. If this character is a U, then at most eight (8) hexadecimal digits are expected to follow it; otherwise a u is expected, followed by at most four (4) hexadecimal digits. The resulting rune is stored in dest.

function escape_rune

string escape_rune(
    string dest,
    rune ch
)

Parameters:

dest Empty string, should be large enough for minimum 4 bytes, plus 1 byte for the NUL-terminator.
ch The UCS-4 rune to convert from.

Return: Slice of dest with correct length.

Given a rune, convert it to an ASCII escape sequence.

function utf8_unescape

string utf8_unescape(
    string dest,
    string src
)

Parameters:

dest Empty string structure.
src String containing escapes.

Return: Slice of dest with correct length.

Convert a string containing ASCII escape sequences to a proper UTF-8 string.

function utf8_escape

string utf8_escape(
    string dest,
    string src,
    bool escape_quotes
)

Parameters:

dest Empty string structure.
src String containing valid UTF-8.
escape_quotes If true, quotation-marks will have backslashes prepended too.

Return: Slice of dest string with correct length.

Convert a string containing UTF-8 to ASCII with escape sequences.

function utf_strchr

string utf_strchr(
    string s,
    rune ch,
    usize * i
)

Parameters:

s String to search through.
ch Rune/character to find.
i Pointer to be set to index of character in string.

Return: Slice of s string, starting at first occurrence. Points to nil with zero (0) length if no such character is found.

Find first occurrence of character ch in string s.

function utf_strlen

usize utf_strlen(
    string s
)

Counts number of characters (runes) in a UTF-8 string. Not the number of bytes, which is s.len.

function utf_seqlen

usize utf_seqlen(
    string 
)

Returns length of next UTF-8 sequence.

function is_locale_utf8

bool is_locale_utf8(
    byte * locale
)

Given the C-string returned by setlocale, determine whether the current locale speaks UTF-8.

function is_octal_digit

static inline bool is_octal_digit(
    byte c
)

function is_hex_digit

static inline bool is_hex_digit(
    byte c
)

Macros Documentation

define is_utf

#define is_utf(
    c
)
(((c) & 0xC0) != 0x80)

Is c the start of a UTF-8 sequence?

Source code

#include "common.h"
#include <string.h>

#pragma once

/* Is `c` the start of a UTF-8 sequence?
 * True for every byte whose top two bits are not `10`. */
#define is_utf(c) (((c) & 0xC0) != 0x80)

static inline
string to_string(const byte *c)
    { return ((string){ .len = strlen(c), .value = (byte *)c }); }

/* Conversions. */

/* Convert UTF-8 `src` to UCS-4 (4-byte wide characters) in `dest`.
 * No error checking is done: `src` must be valid UTF-8 and `dest` must be
 * large enough (dest.len >= src.len + 1 always leaves enough space).
 * Returns a slice of the converted `dest` with the correct length. */
runic utf8_to_ucs4(runic dest, string src);

/* Convert UCS-4/UTF-32 `src` to UTF-8 in `dest`; will try to
 * NUL-terminate if there is space.  No error checking is done.
 * dest.len >= 4 * src.len + 4 ensures a fit.
 * Returns a slice of `dest` holding the converted UTF-8 bytes. */
string ucs4_to_utf8(string dest, runic src);

/* Convert a single UCS-4 rune `ch` to UTF-8 in `dest`.
 * `dest` should have room for 4 bytes, or 5 if NUL-termination is desired.
 * Returns a slice of `dest` with the correct length. */
string rune_to_utf8(string dest, rune ch);

/* Moving through strings. */

/* Character number to byte offset: given the n-th character/rune in a
 * UTF-8 string, how many bytes that is from the start of the string. */
usize byte_offset(string, usize);

/* Byte offset to character number: given a byte offset from the start of
 * a UTF-8 string, how many characters/runes precede it. */
usize char_num(string, usize);

/* Step through string `s` one rune/character at a time.
 * `*i` holds the current position and is updated to the next
 * start-of-character byte.  Returns the character (rune). */
rune read_rune(string s, usize *i);

/* Update byte-index to move to the next rune, skipping it. */
u0 next_rune(string, usize *);

/* Update byte-index to move to the previous rune. */
u0 prev_rune(string, usize *);

/* Unicode Escapes. */

/* Parse a Unicode escape.  `src` points to the character right after the
 * backslash: a `U` is followed by at most eight hexadecimal digits, a `u`
 * by at most four.  The resulting rune is stored in `*dest`.
 * Returns how many bytes were read while parsing (including the u/U),
 * or 0 if the escape is invalid. */
usize read_escape(string src, rune *dest);

/* Convert the UCS-4 rune `ch` to an ASCII escape sequence in `dest`.
 * `dest` should be large enough for a minimum of 4 bytes, plus 1 byte for
 * the NUL-terminator.  Returns a slice of `dest` with the correct length. */
string escape_rune(string dest, rune ch);

/* Convert `src`, a string containing ASCII escape sequences, to a proper
 * UTF-8 string in `dest`.  Returns a slice of `dest` with the correct
 * length. */
string utf8_unescape(string dest, string src);

/* Convert `src`, a string containing valid UTF-8, to ASCII with escape
 * sequences in `dest`.  If `escape_quotes` is true, quotation marks get a
 * backslash prepended too.  Returns a slice of `dest` with the correct
 * length. */
string utf8_escape(string dest, string src, bool escape_quotes);

/* UTF-8 oriented standard function replacements. */

/* Find the first occurrence of rune `ch` in string `s`.
 * `*i` is set to the index of the character in the string.
 * Returns a slice of `s` starting at the first occurrence; points to nil
 * with zero length if no such character is found. */
string utf_strchr(string s, rune ch, usize *i);

/* Count the number of characters (runes) in a UTF-8 string.
 * This is not the number of bytes, which is s.len. */
usize utf_strlen(string s);

/* Returns length of next UTF-8 sequence. */
usize utf_seqlen(string);

/* Given the C-string returned by setlocale, determine whether the
 * current locale speaks UTF-8. */
bool is_locale_utf8(byte *locale);

/* Utility. */
/* True when `c` is one of the characters '0' through '7'. */
static inline bool is_octal_digit(byte c)
{
    if (c < '0') {
        return false;
    }
    return c <= '7';
}

/* True when `c` is a hexadecimal digit: '0'-'9', 'A'-'F' or 'a'-'f'. */
static inline bool is_hex_digit(byte c)
{
    if ('0' <= c && c <= '9') {
        return true;
    }
    if ('A' <= c && c <= 'F') {
        return true;
    }
    return 'a' <= c && c <= 'f';
}

Updated on 23 August 2022 at 00:54:19 UTC

Crelude Documentation