#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>

/*
 * Decode one UTF-8 sequence starting at str into a Unicode code point.
 *
 * str   - first byte of the sequence. At most four bytes are examined and
 *         scanning stops at the first byte that is not a continuation byte,
 *         so a NUL-terminated string is never read past its terminator.
 * chPtr - receives the decoded code point, or the raw lead byte when the
 *         input is malformed.
 *
 * Returns the number of bytes consumed, i.e. the offset of the next
 * sequence. Malformed input (stray continuation byte, invalid lead byte,
 * missing continuation byte, overlong encoding) yields the lead byte itself
 * and a return value of 1, so a caller looping over a buffer always makes
 * progress.
 *
 * Encoded surrogates and 4-byte values above U+10FFFF (up to 0x1FFFFF) are
 * passed through unchanged.
 */
static int gd_UTF8_To_Unicode(const char *str, uint32_t *chPtr){
	/* Sequence length indexed by the top five bits of the lead byte.
	 * 0 marks bytes that cannot start a sequence (0x80-0xBF, 0xF8-0xFF). */
	static const uint8_t len[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
	/* Smallest code point that genuinely needs a sequence of each length;
	 * anything lower is an overlong encoding. */
	static const uint32_t min_cp[5] = {0, 0, 0x80, 0x800, 0x10000};
	const unsigned char *p = (const unsigned char *)str;
	const uint32_t byte = p[0];
	const uint8_t total = len[byte >> 3];

	if (total > 1) {
		/* The lead byte contributes 5, 4 or 3 payload bits for 2-, 3-
		 * and 4-byte sequences respectively. */
		uint32_t ch = byte & (0x3Fu >> (total - 1));

		for (uint8_t i = 1; i < total; i++) {
			if ((p[i] & 0xC0) != 0x80) {
				/* Missing continuation byte: emit the lead byte alone. */
				*chPtr = byte;
				return 1;
			}
			ch = (ch << 6) | (p[i] & 0x3Fu);
		}

		if (ch < min_cp[total]) {
			/* Overlong form (e.g. C0 80 for U+0000): treat as malformed
			 * rather than silently aliasing a shorter encoding. */
			*chPtr = byte;
			return 1;
		}

		*chPtr = ch;
		return total;
	}

	/* ASCII, or a byte that cannot start a sequence. Returning 1 instead of
	 * 0 keeps callers from looping forever on bad input. */
	if (byte > 0xF7) {
		printf("WARNING: utf-8 above 21-bit unicode range\n");
	}

	*chPtr = byte;
	return 1;
}

/*
 * Decode ./all-utf8.txt as UTF-8 and write one decimal code point per line
 * to ./all-uni.txt.
 *
 * The input is memory-mapped read-only and walked one sequence at a time
 * with gd_UTF8_To_Unicode(). Returns 0 on success and 1 if the input cannot
 * be opened, inspected or mapped, or the output cannot be opened or written.
 * An empty input produces an empty output file.
 */
int main(int argc, char **argv){
	const char *out_file_name = "./all-uni.txt";
	const char *in_file_name = "./all-utf8.txt";
	FILE *out_file = NULL;
	char *mem_file = MAP_FAILED;
	size_t file_bytes = 0;
	size_t offset = 0;
	struct stat buffer;
	int rc = 1;

	(void)argc;
	(void)argv;

	int input = open(in_file_name, O_RDONLY);
	if (input == -1) {
		fprintf(stderr, "Error opening %s\n", in_file_name);
		return 1;
	}

	if (fstat(input, &buffer) == -1) {
		fprintf(stderr, "Error: fstat failed on %s\n", in_file_name);
		goto cleanup;
	}
	file_bytes = (size_t)buffer.st_size;

	/* mmap() rejects a zero length, so an empty file is simply not mapped. */
	if (file_bytes > 0) {
		mem_file = mmap(NULL, file_bytes, PROT_READ, MAP_SHARED, input, 0);
		if (mem_file == MAP_FAILED) {
			fprintf(stderr, "Error: mmap MAP_FAILED\n");
			goto cleanup;
		}
	}

	out_file = fopen(out_file_name, "w");
	if (out_file == NULL) {
		fprintf(stderr, "Error opening %s\n", out_file_name);
		goto cleanup;
	}

	rc = 0;
	/* size_t offset: a 32-bit counter would wrap on inputs of 4 GiB or more. */
	while (offset < file_bytes) {
		const size_t remaining = file_bytes - offset;
		const char *src = &mem_file[offset];
		char tail[4] = {0};
		uint32_t unicode;

		/* The decoder may look up to three bytes beyond the lead byte.
		 * Near the end of the file, decode from a zero-padded copy so a
		 * truncated sequence cannot read past the mapping; the zero
		 * padding is not a continuation byte, so such a sequence falls
		 * back to a single raw byte. */
		if (remaining < sizeof tail) {
			memcpy(tail, src, remaining);
			src = tail;
		}

		offset += (size_t)gd_UTF8_To_Unicode(src, &unicode);
		if (fprintf(out_file, "%u\n", (unsigned int)unicode) < 0) {
			fprintf(stderr, "Error writing %s\n", out_file_name);
			rc = 1;
			break;
		}
	}

cleanup:
	/* fclose() flushes buffered output, so a failure here means lost data. */
	if (out_file != NULL && fclose(out_file) != 0) {
		fprintf(stderr, "Error closing %s\n", out_file_name);
		rc = 1;
	}
	if (mem_file != MAP_FAILED && munmap(mem_file, file_bytes) == -1) {
		fprintf(stderr, "Error: munmap\n");
	}
	close(input);
	return rc;
}
