// Mirai's Miscellaneous Misadventures

// M41 / core / text-ualyze.c

// copyright 2023 zamfofex
// license: AGPLv3 or later

// todo: this is really slow!
// it has to recompute a lot of stuff across calls.
// but it's necessary to fit the current API.

// todo: this is currently necessary...
#define char16_t uint16_t

#include <stdlib.h>
#include <ualyze.h>

#include <mimimi/text.h>

// Encodes one code point of '*text' (as decoded by 'mimimi_code_point') as UTF-16.
// The code units are written to 'ints', which must have room for two of them,
// and '*text' is advanced with 'mimimi_skip_code_point'.
// Returns the number of code units written: 1, or 2 for a surrogate pair.
static int mimimi_utf16(uint16_t *ints, unsigned char **text)
{
	unsigned char *cursor = *text;
	unsigned long int code = mimimi_code_point(cursor);
	*text = mimimi_skip_code_point(cursor);
	
	if (code >= 0x10000)
	{
		// does not fit in a single unit: split the offset above 0x10000
		// into a high (upper bits) and a low (bottom 10 bits) surrogate
		unsigned long int offset = code - 0x10000;
		ints[0] = 0xD800 + (offset >> 10);
		ints[1] = 0xDC00 + (offset & 0x3FF);
		return 2;
	}
	
	ints[0] = code;
	return 1;
}

// Returns how many UTF-16 code units (1 or 2) the first code point of 'text' takes.
// The caller's pointer is unaffected: only the local copy of 'text' is advanced,
// and the encoded units are written to a scratch array and discarded.
static int mimimi_utf16_count(unsigned char *text)
{
	uint16_t scratch[2];
	return mimimi_utf16(scratch, &text);
}

// Returns the total number of UTF-16 code units needed to encode
// the NUL-terminated 'text' (the terminator itself is not counted).
static int mimimi_utf16_count_text(unsigned char *text)
{
	int total = 0;
	for (unsigned char *p = text; *p != 0; p = mimimi_skip_code_point(p))
	{
		total += mimimi_utf16_count(p);
	}
	return total;
}

// Converts the NUL-terminated 'text' to a freshly allocated, zero-terminated
// UTF-16 array, creates a ualyze buffer in '*buffer' and runs
// 'ual_analyze_paragraph' on that array.
// The value returned by 'ual_analyze_paragraph' is treated as a count of UTF-16
// code units and converted back to a count of code points of 'text',
// which is the return value.
// The process exits with status 1 if either allocation fails.
// The UTF-16 array is not freed here; presumably it is the pointer that
// 'ual_buffer_text' later hands back to 'mimimi_finish_ual_buffer'.
static int mimimi_ual_buffer(ual_buffer **buffer, unsigned char *text)
{
	int unit_count = mimimi_utf16_count_text(text);
	uint16_t *units = NULL;
	
	// an empty text is passed on as a null pointer with length zero
	if (unit_count > 0)
	{
		units = malloc((unit_count + 1) * sizeof *units);
		if (units == NULL) exit(1);
		
		uint16_t *out = units;
		for (unsigned char *in = text; *in != 0; )
		{
			out += mimimi_utf16(out, &in);
		}
		*out = 0;
	}
	
	*buffer = ual_buffer_create();
	if (*buffer == NULL) exit(1);
	
	// walk 'text' until the analyzed code units are used up,
	// counting one per code point rather than one per code unit
	int remaining = ual_analyze_paragraph(*buffer, units, unit_count);
	int code_points = 0;
	for (; remaining > 0; code_points++)
	{
		remaining -= mimimi_utf16_count(text);
		text = mimimi_skip_code_point(text);
	}
	return code_points;
}

// Releases 'buffer' and frees the text pointer it reported.
// The pointer is fetched before the release, since the buffer
// must not be queried afterwards.
static void mimimi_finish_ual_buffer(ual_buffer *buffer)
{
	uint16_t *units = ual_buffer_text(buffer);
	ual_buffer_release(buffer);
	// freeing a null pointer is a no-op, so no guard is needed
	free(units);
}

// Counts the code points of 'text' that come before the first
// line/paragraph terminator or the terminating NUL.
// The terminator itself is not included in the count.
int mimimi_count_paragraph(unsigned char *text)
{
	int count = 0;
	for (;; count++)
	{
		unsigned long int cp = mimimi_code_point(text);
		switch (cp)
		{
			case 0:      // end of text
			case 0x0A:   // line feed
			case 0x0B:   // vertical tab
			case 0x0C:   // form feed
			case 0x0D:   // carriage return
			case 0x0085: // next line
			case 0x2028: // line separator
			case 0x2029: // paragraph separator
				return count;
			default:
				break;
		}
		text = mimimi_skip_code_point(text);
	}
}

// Returns a pointer just past the first paragraph of 'text',
// as delimited by ualyze's paragraph analysis.
unsigned char *mimimi_skip_paragraph(unsigned char *text)
{
	ual_buffer *buffer;
	int code_points = mimimi_ual_buffer(&buffer, text);
	mimimi_finish_ual_buffer(buffer);
	
	// 'mimimi_ual_buffer' reports the paragraph length in code points, not bytes,
	// so the pointer has to be advanced one code point at a time;
	// adding the count directly would land inside the paragraph
	// whenever it contains a multi-byte code point.
	// the NUL check is only a safeguard against running off the end of the text.
	while (code_points > 0 && *text != 0)
	{
		text = mimimi_skip_code_point(text);
		code_points--;
	}
	return text;
}

// Scans 'text' for the next break position reported by ualyze.
// 'flags' is a mask of 'UAL_BREAK_*' bits tested against the 'bc' field of each
// analyzed character; scanning stops at the first code point with any of them set.
// When 'n' is nonzero, the first code point is skipped without being tested,
// so a non-empty text always advances.
// Scanning also stops at the end of the analyzed paragraph and at the terminating NUL.
// Returns a pointer to where scanning stopped.
static unsigned char *mimimi_ual_until(unsigned char *text, int flags, int n)
{
	ual_buffer *buffer;
	// number of code points in the analyzed paragraph;
	// only that many characters have analysis results
	int remaining = mimimi_ual_buffer(&buffer, text);
	ual_analyze_breaks(buffer);
	
	ual_char *chars = ual_buffer_chars(buffer);
	
	// ualyze was given UTF-16, so 'chars' is assumed to hold one entry per
	// UTF-16 code unit (TODO confirm against ualyze); a code point that needs
	// a surrogate pair therefore advances 'chars' by two entries
	if (*text != 0 && n != 0)
	{
		chars += mimimi_utf16_count(text);
		text = mimimi_skip_code_point(text);
		remaining--;
	}
	
	// 'remaining' is checked first so that 'chars' is never read
	// past the entries of the analyzed paragraph
	while (*text != 0 && remaining > 0 && (chars->bc & flags) == 0)
	{
		chars += mimimi_utf16_count(text);
		text = mimimi_skip_code_point(text);
		remaining--;
	}
	
	mimimi_finish_ual_buffer(buffer);
	return text;
}

// Returns a pointer past the first code point of 'text' and any following
// code points up to the next position flagged 'UAL_BREAK_CLUSTER'.
unsigned char *mimimi_skip_grapheme(unsigned char *text)
{
	unsigned char *end = mimimi_ual_until(text, UAL_BREAK_CLUSTER, 1);
	return end;
}

// Returns the distance in bytes from 'text' to the position
// that 'mimimi_skip_grapheme' reports for it.
int mimimi_count_grapheme(unsigned char *text)
{
	unsigned char *end = mimimi_skip_grapheme(text);
	return end - text;
}

// Returns a pointer past the first code point of 'text' and any following
// code points up to the next position flagged 'UAL_BREAK_LINE'.
unsigned char *mimimi_skip_word(unsigned char *text)
{
	unsigned char *end = mimimi_ual_until(text, UAL_BREAK_LINE, 1);
	return end;
}

// Returns the distance in bytes from 'text' to the first position flagged
// 'UAL_BREAK_LINE' or 'UAL_BREAK_SPACES'.
// Unlike 'mimimi_skip_word', the first code point is tested too,
// so the result may be zero.
int mimimi_count_word(unsigned char *text)
{
	int flags = UAL_BREAK_LINE | UAL_BREAK_SPACES;
	unsigned char *end = mimimi_ual_until(text, flags, 0);
	return end - text;
}