#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>

/*
 * Decode a single UTF-8 sequence starting at str.
 *
 * str   - pointer to the first byte of the sequence. The bytes that follow
 *         are only examined as far as they are valid continuation bytes
 *         (10xxxxxx), so a NUL-terminated string is never read past its
 *         terminator.
 * chPtr - receives the decoded code point. It is always written, including
 *         for malformed input.
 *
 * Returns the number of bytes consumed (1 to 4).
 *
 * Malformed input is handled leniently: a lead byte that is not followed by
 * the required continuation bytes, a stray continuation byte (0x80-0xBF) and
 * a lead byte >= 0xF8 are each returned as their own byte value with a
 * length of 1. Overlong encodings, surrogates and values above U+10FFFF are
 * not rejected.
 */
static int gd_UTF8_To_Unicode(const char *str, uint32_t *chPtr){
        const uint8_t *s = (const uint8_t *)str;
        uint32_t b0 = s[0];

        if(b0 < 0xC0){
                /* ASCII, or a stray continuation byte passed through as-is. */
                *chPtr = b0;
                return 1;
        }

        /*
         * The && chains below rely on short-circuit evaluation: s[n + 1] is
         * only read once s[n] has been confirmed to be a continuation byte,
         * which by definition is not the terminating NUL.
         */
        if(b0 < 0xE0){
                /* 110xxxxx 10xxxxxx */
                if((s[1] & 0xC0) == 0x80){
                        *chPtr = ((b0 & 0x1F) << 6) | (uint32_t)(s[1] & 0x3F);
                        return 2;
                }
        } else if(b0 < 0xF0){
                /* 1110xxxx 10xxxxxx 10xxxxxx */
                if(((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)){
                        *chPtr = ((b0 & 0x0F) << 12)
                               | ((uint32_t)(s[1] & 0x3F) << 6)
                               | (uint32_t)(s[2] & 0x3F);
                        return 3;
                }
        } else if(b0 < 0xF8){
                /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
                if(((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80) && ((s[3] & 0xC0) == 0x80)){
                        *chPtr = ((b0 & 0x07) << 18)
                               | ((uint32_t)(s[1] & 0x3F) << 12)
                               | ((uint32_t)(s[2] & 0x3F) << 6)
                               | (uint32_t)(s[3] & 0x3F);
                        return 4;
                }
        } else {
                /* 0xF8 and above would start a 5+ byte sequence, which is not supported. */
                printf("WARNING: utf-8 above 21-bit unicode range\n");
        }

        /* Malformed or unsupported sequence: emit the lead byte on its own. */
        *chPtr = b0;
        return 1;
}

/*
 * Read ./all-utf8.txt, decode it as UTF-8 and write every decoded code point
 * to ./all-uni.txt as an unsigned decimal number, one per line.
 *
 * The input is memory-mapped read-only. Command-line arguments are ignored.
 *
 * Returns 0 on success and 1 if the input cannot be opened, stat'ed or
 * mapped, or if the output cannot be opened, written or closed.
 */
int main(int argc, char **argv){
	const char *out_file_name = "./all-uni.txt";
	const char *in_file_name = "./all-utf8.txt";
	FILE *out_file = NULL;
	char *mem_file = MAP_FAILED;
	size_t file_bytes = 0;
	size_t pos = 0;
	struct stat buffer;
	int input;
	int rc = 1;

	(void)argc;
	(void)argv;

	input = open(in_file_name, O_RDONLY);
	if(input == -1){
		fprintf(stderr, "Error opening %s\n", in_file_name);
		return 1;
	}

	if(fstat(input, &buffer) == -1 || buffer.st_size < 0){
		fprintf(stderr, "Error: fstat failed on %s\n", in_file_name);
		goto out;
	}
	file_bytes = (size_t)buffer.st_size;

	/* mmap rejects a zero length, so an empty input is simply not mapped. */
	if(file_bytes > 0){
		mem_file = mmap(NULL, file_bytes, PROT_READ, MAP_SHARED, input, 0);
		if(mem_file == MAP_FAILED){
			fprintf(stderr, "Error: mmap MAP_FAILED\n");
			goto out;
		}
	}

	out_file = fopen(out_file_name, "w");
	if(out_file == NULL){
		fprintf(stderr, "Error opening %s\n", out_file_name);
		goto out;
	}

	while(pos < file_bytes){
		size_t left = file_bytes - pos;
		const char *src = &mem_file[pos];
		char tail[8] = {0};
		uint32_t unicode;
		int len;

		/*
		 * The decoder may look at up to 4 bytes. Near the end of the
		 * mapping, decode from a zero-padded copy instead so it never
		 * touches memory beyond the file contents.
		 */
		if(left < 4){
			memcpy(tail, src, left);
			src = tail;
		}

		/* Pre-seed with the raw byte so the value is defined even if
		 * the decoder leaves the output untouched. */
		unicode = (uint8_t)*src;
		len = gd_UTF8_To_Unicode(src, &unicode);

		/* Guarantee forward progress and never step past the end. */
		if(len < 1 || (size_t)len > left){
			len = 1;
		}
		pos += (size_t)len;

		if(fprintf(out_file, "%lu\n", (unsigned long)unicode) < 0){
			fprintf(stderr, "Error writing %s\n", out_file_name);
			goto out;
		}
	}
	rc = 0;

out:
	/* fclose flushes buffered output, so a failure here means lost data. */
	if(out_file != NULL && fclose(out_file) != 0){
		fprintf(stderr, "Error closing %s\n", out_file_name);
		rc = 1;
	}
	if(mem_file != MAP_FAILED && munmap(mem_file, file_bytes) == -1){
		fprintf(stderr, "Error: munmap\n");
	}
	close(input);
	return rc;
}
