/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * stf-parse.c : Structured Text Format parser. (STF)
 *               A general purpose engine for parsing data
 *               in CSV and Fixed width format.
 *
 *
 * Copyright (C) Almer. S. Tigelaar.
 * EMail: almer1@dds.nl or almer-t@bigfoot.com
 *
 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <gnumeric-config.h>
#include <glib/gi18n-lib.h>
#include "gnumeric.h"
#include "stf-parse.h"
#include "stf-export.h"

#include "workbook.h"
#include "cell.h"
#include "sheet.h"
#include "expr.h"
#include "clipboard.h"
#include "sheet-style.h"
#include "value.h"
#include "mstyle.h"
#include "number-match.h"
#include "gutils.h"
#include "parse-util.h"
#include <goffice/goffice.h>

#include <stdlib.h>
#include <locale.h>
#include <string.h>

#define SETUP_LOCALE_SWITCH char *oldlocale = NULL

#define START_LOCALE_SWITCH if (parseoptions->locale) {\
oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
go_setlocale(LC_ALL, parseoptions->locale);}

#define END_LOCALE_SWITCH if (oldlocale) {\
go_setlocale(LC_ALL, oldlocale);\
g_free (oldlocale);}

/* Source_t struct, used for interchanging parsing information between the low level parse functions */
typedef struct {
	GStringChunk *chunk;
	char const *position;  /* Indicates the current position within data */

	/* Used internally for fixed width parsing */
	int splitpos;          /* Indicates current position in splitpositions array */
	int linepos;           /* Position on the current line */
} Source_t;

/* Struct used for autodiscovery */
typedef struct {
	int start;
	int stop;
} AutoDiscovery_t;

/*
 * Some silly dude make the length field an unsigned int.  C just does
 * not deal very well with that.
 */
static inline int
my_garray_len (GArray const *a)
{
	return (int)a->len;
}

static char *
my_utf8_strchr (const char *p, gunichar uc)
{
	return uc < 0x7f ? strchr (p, uc) : g_utf8_strchr (p, -1, uc);
}

static int
compare_terminator (char const *s, StfParseOptions_t *parseoptions)
{
	guchar const *us = (guchar const *)s;
	GSList *l;

	if (*us > parseoptions->compiled_terminator.max ||
	    *us < parseoptions->compiled_terminator.min)
		return 0;

	for (l = parseoptions->terminator; l; l = l->next) {
		char const *term = l->data;
		char const *d = s;

		while (*term) {
			if (*d != *term)
				goto next;
			term++;
			d++;
		}
		return d - s;

	next:
		;
	}
	return 0;
}


/*******************************************************************************************************
 * STF PARSE OPTIONS : StfParseOptions related
 *******************************************************************************************************/

/**
 * stf_parse_options_new:
 *
 * This will return a new StfParseOptions_t struct.
 * The struct should, after being used, freed with stf_parse_options_free.
 **/
static StfParseOptions_t *
stf_parse_options_new (void)
{
	StfParseOptions_t* parseoptions = g_new0 (StfParseOptions_t, 1);

	parseoptions->parsetype   = PARSE_TYPE_NOTSET;

	parseoptions->terminator  = NULL;
	stf_parse_options_add_line_terminator (parseoptions, "\r\n");
	stf_parse_options_add_line_terminator (parseoptions, "\n");
	stf_parse_options_add_line_terminator (parseoptions, "\r");

	parseoptions->trim_spaces = (TRIM_TYPE_RIGHT | TRIM_TYPE_LEFT);
	parseoptions->locale = NULL;

	parseoptions->splitpositions = NULL;
	stf_parse_options_fixed_splitpositions_clear (parseoptions);

	parseoptions->stringindicator = '"';
	parseoptions->indicator_2x_is_single = TRUE;
	parseoptions->sep.duplicates = FALSE;
	parseoptions->trim_seps = FALSE;

	parseoptions->sep.str = NULL;
	parseoptions->sep.chr = NULL;

	parseoptions->col_autofit_array = NULL;
	parseoptions->col_import_array = NULL;
	parseoptions->col_import_array_len = 0;
	parseoptions->formats = NULL;

	parseoptions->cols_exceeded = FALSE;
	parseoptions->rows_exceeded = FALSE;
	parseoptions->ref_count = 1;

	return parseoptions;
}

/**
 * stf_parse_options_free:
 *
 * will free @parseoptions, note that this will not free the splitpositions
 * member (GArray) of the struct, the caller is responsible for that.
 **/
void
stf_parse_options_free (StfParseOptions_t *parseoptions)
{
	g_return_if_fail (parseoptions != NULL);

	if (parseoptions->ref_count-- > 1)
		return;

	g_free (parseoptions->col_import_array);
	g_free (parseoptions->col_autofit_array);
	g_free (parseoptions->locale);
	g_free (parseoptions->sep.chr);

	if (parseoptions->sep.str) {
		GSList *l;

		for (l = parseoptions->sep.str; l != NULL; l = l->next)
			g_free ((char *) l->data);
		g_slist_free (parseoptions->sep.str);
	}

	g_array_free (parseoptions->splitpositions, TRUE);

	stf_parse_options_clear_line_terminator (parseoptions);

	if (parseoptions->formats) {
		unsigned int ui;
		GPtrArray *formats = parseoptions->formats;

		for (ui = 0; ui < formats->len; ui++)
			go_format_unref (g_ptr_array_index (formats, ui));
		g_ptr_array_free (formats, TRUE);
		parseoptions->formats = NULL;
	}

	g_free (parseoptions);
}

static StfParseOptions_t *
stf_parse_options_ref (StfParseOptions_t *parseoptions)
{
	parseoptions->ref_count++;
	return parseoptions;
}

GType
stf_parse_options_get_type (void)
{
	static GType t = 0;

	if (t == 0) {
		t = g_boxed_type_register_static ("StfParseOptions_t",
			 (GBoxedCopyFunc)stf_parse_options_ref,
			 (GBoxedFreeFunc)stf_parse_options_free);
	}
	return t;
}

void
stf_parse_options_set_type (StfParseOptions_t *parseoptions, StfParseType_t const parsetype)
{
	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (parsetype == PARSE_TYPE_CSV || parsetype == PARSE_TYPE_FIXED);

	parseoptions->parsetype = parsetype;
}

static gint
long_string_first (gchar const *a, gchar const *b)
{
	/* This actually is UTF-8 safe.  */
	return strlen (b) - strlen (a);
}

static void
compile_terminators (StfParseOptions_t *parseoptions)
{
	GSList *l;
	GO_SLIST_SORT (parseoptions->terminator, (GCompareFunc)long_string_first);

	parseoptions->compiled_terminator.min = 255;
	parseoptions->compiled_terminator.max = 0;
	for (l = parseoptions->terminator; l; l = l->next) {
		const guchar *term = l->data;
		parseoptions->compiled_terminator.min =
			MIN (parseoptions->compiled_terminator.min, *term);
		parseoptions->compiled_terminator.max =
			MAX (parseoptions->compiled_terminator.max, *term);
	}
}

/**
 * stf_parse_options_add_line_terminator:
 *
 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
 * this indicates the end of a row.
 *
 **/
void
stf_parse_options_add_line_terminator (StfParseOptions_t *parseoptions, char const *terminator)
{
	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (terminator != NULL && *terminator != 0);

	GO_SLIST_PREPEND (parseoptions->terminator, g_strdup (terminator));
	compile_terminators (parseoptions);
}

/**
 * stf_parse_options_clear_line_terminator:
 *
 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
 * this indicates the end of a row.
 *
 **/
void
stf_parse_options_clear_line_terminator (StfParseOptions_t *parseoptions)
{
	g_return_if_fail (parseoptions != NULL);

	g_slist_free_full (parseoptions->terminator, g_free);
	parseoptions->terminator = NULL;
	compile_terminators (parseoptions);
}

/**
 * stf_parse_options_set_trim_spaces:
 *
 * If enabled will trim spaces in every parsed field on left and/or right
 * sides.
 **/
void
stf_parse_options_set_trim_spaces (StfParseOptions_t *parseoptions, StfTrimType_t const trim_spaces)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->trim_spaces = trim_spaces;
}

/**
 * stf_parse_options_csv_set_separators:
 * @parseoptions: #StfParseOptions_t
 * @character:
 * @string: (element-type char):
 *
 * A copy is made of the parameters.
 **/
void
stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions, char const *character,
				      GSList const *string)
{
	g_return_if_fail (parseoptions != NULL);

	g_free (parseoptions->sep.chr);
	parseoptions->sep.chr = g_strdup (character);

	g_slist_free_full (parseoptions->sep.str, g_free);
	parseoptions->sep.str = go_slist_map (string, (GOMapFunc)g_strdup);
}

void
stf_parse_options_csv_set_stringindicator (StfParseOptions_t *parseoptions, gunichar const stringindicator)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->stringindicator = stringindicator;
}

/**
 * stf_parse_options_csv_set_indicator_2x_is_single:
 * @indic_2x: a boolean value indicating whether we want to see two
 *		adjacent string indicators as a single string indicator
 *		that is part of the cell, rather than a terminator.
 **/
void
stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoptions,
						  gboolean const indic_2x)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->indicator_2x_is_single = indic_2x;
}

/**
 * stf_parse_options_csv_set_duplicates:
 * @parseoptions:
 * @duplicates: a boolean value indicating whether we want to see two
 *               separators right behind each other as one
 **/
void
stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->sep.duplicates = duplicates;
}

/**
 * stf_parse_options_csv_set_trim_seps:
 * @trim_seps: a boolean value indicating whether we want to ignore
 *               separators at the beginning of lines
 **/
void
stf_parse_options_csv_set_trim_seps (StfParseOptions_t *parseoptions, gboolean const trim_seps)
{
	g_return_if_fail (parseoptions != NULL);

	parseoptions->trim_seps = trim_seps;
}

/**
 * stf_parse_options_fixed_splitpositions_clear:
 *
 * This will clear the splitpositions (== points on which a line is split)
 **/
void
stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t *parseoptions)
{
	int minus_one = -1;
	g_return_if_fail (parseoptions != NULL);

	if (parseoptions->splitpositions)
		g_array_free (parseoptions->splitpositions, TRUE);
	parseoptions->splitpositions = g_array_new (FALSE, FALSE, sizeof (int));

	g_array_append_val (parseoptions->splitpositions, minus_one);
}

/**
 * stf_parse_options_fixed_splitpositions_add:
 *
 * @position will be added to the splitpositions.
 **/
void
stf_parse_options_fixed_splitpositions_add (StfParseOptions_t *parseoptions, int position)
{
	unsigned int ui;

	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (position >= 0);

	for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
		int here = g_array_index (parseoptions->splitpositions, int, ui);
		if (position == here)
			return;
		if (position < here)
			break;
	}

	g_array_insert_val (parseoptions->splitpositions, ui, position);
}

void
stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t *parseoptions, int position)
{
	unsigned int ui;

	g_return_if_fail (parseoptions != NULL);
	g_return_if_fail (position >= 0);

	for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
		int here = g_array_index (parseoptions->splitpositions, int, ui);
		if (position == here)
			g_array_remove_index (parseoptions->splitpositions, ui);
		if (position <= here)
			return;
	}
}

int
stf_parse_options_fixed_splitpositions_count (StfParseOptions_t *parseoptions)
{
	return parseoptions->splitpositions->len;
}

int
stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t *parseoptions, int n)
{
	return g_array_index (parseoptions->splitpositions, int, n);
}


/**
 * stf_parse_options_valid:
 * @parseoptions: an import options struct
 *
 * Checks if @parseoptions is correctly filled
 *
 * returns : TRUE if it is correctly filled, FALSE otherwise.
 **/
static gboolean
stf_parse_options_valid (StfParseOptions_t *parseoptions)
{
	g_return_val_if_fail (parseoptions != NULL, FALSE);

	if (parseoptions->parsetype == PARSE_TYPE_FIXED) {
		if (!parseoptions->splitpositions) {
			g_warning ("STF: No splitpositions in struct");
			return FALSE;
		}
	}

	return TRUE;
}

/*******************************************************************************************************
 * STF PARSE : The actual routines that do the 'trick'
 *******************************************************************************************************/

static void
trim_spaces_inplace (char *field, StfParseOptions_t const *parseoptions)
{
	if (!field) return;

	if (parseoptions->trim_spaces & TRIM_TYPE_LEFT) {
		char *s = field;

		while (g_unichar_isspace (g_utf8_get_char (s)))
			s = g_utf8_next_char (s);

		if (s != field)
			memmove (field, s, 1 + strlen (s));
	}

	if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
		char *s = field + strlen (field);

		while (field != s) {
			s = g_utf8_prev_char (s);
			if (!g_unichar_isspace (g_utf8_get_char (s)))
				break;
			*s = 0;
		}
	}
}

/**
 * stf_parse_csv_is_separator:
 *
 * returns NULL if @character is not a separator, a pointer to the character
 * after the separator otherwise.
 **/
static char const *
stf_parse_csv_is_separator (char const *character, char const *chr, GSList const *str)
{
	g_return_val_if_fail (character != NULL, NULL);

	if (*character == 0)
		return NULL;

	if (str) {
		GSList const *l;

		for (l = str; l != NULL; l = l->next) {
			char const *s = l->data;
			char const *r;
			glong cnt;
			glong const len = g_utf8_strlen (s, -1);

			/* Don't compare past the end of the buffer! */
			for (r = character, cnt = 0; cnt < len; cnt++, r = g_utf8_next_char (r))
				if (*r == '\0')
					break;

			if ((cnt == len) && (memcmp (character, s, len) == 0))
				return g_utf8_offset_to_pointer (character, len);
		}
	}

	if (chr && my_utf8_strchr (chr, g_utf8_get_char (character)))
		return g_utf8_next_char(character);

	return NULL;
}

/*
 * stf_parse_eat_separators:
 *
 * skip over leading separators
 *
 */

static void
stf_parse_eat_separators (Source_t *src, StfParseOptions_t *parseoptions)
{
	char const *cur, *next;

	g_return_if_fail (src != NULL);
        g_return_if_fail (parseoptions != NULL);

	cur = src->position;

	if (*cur == '\0' || compare_terminator (cur, parseoptions))
		return;
	while ((next = stf_parse_csv_is_separator (cur, parseoptions->sep.chr, parseoptions->sep.str)))
		cur = next;
	src->position = cur;
	return;
}


typedef enum {
	STF_CELL_ERROR,
	STF_CELL_EOF,
	STF_CELL_EOL,
	STF_CELL_FIELD_NO_SEP,
	STF_CELL_FIELD_SEP
} StfParseCellRes;

static StfParseCellRes
stf_parse_csv_cell (GString *text, Source_t *src, StfParseOptions_t *parseoptions)
{
	char const *cur;
	gboolean saw_sep = FALSE;

	g_return_val_if_fail (src != NULL, STF_CELL_ERROR);
	g_return_val_if_fail (parseoptions != NULL, STF_CELL_ERROR);

	cur = src->position;
	g_return_val_if_fail (cur != NULL, STF_CELL_ERROR);

	/* Skip whitespace, but stop at line terminators.  */
	while (1) {
		int term_len;

		if (*cur == 0) {
			src->position = cur;
			return STF_CELL_EOF;
		}

		term_len = compare_terminator (cur, parseoptions);
		if (term_len) {
			src->position = cur + term_len;
			return STF_CELL_EOL;
		}

		if ((parseoptions->trim_spaces & TRIM_TYPE_LEFT) == 0)
			break;

		if (stf_parse_csv_is_separator (cur, parseoptions->sep.chr,
						parseoptions->sep.str))
			break;

		if (!g_unichar_isspace (g_utf8_get_char (cur)))
			break;
		cur = g_utf8_next_char (cur);
	}

	if (parseoptions->stringindicator != 0 &&
	    g_utf8_get_char (cur) == parseoptions->stringindicator) {
		cur = g_utf8_next_char (cur);
		while (*cur) {
			gunichar uc = g_utf8_get_char (cur);
			cur = g_utf8_next_char (cur);

			if (uc == parseoptions->stringindicator) {
				if (parseoptions->indicator_2x_is_single &&
				    g_utf8_get_char (cur) == parseoptions->stringindicator)
					cur = g_utf8_next_char (cur);
				else {
					/* "field content"dropped-garbage,  */
					while (*cur && !compare_terminator (cur, parseoptions)) {
						char const *post = stf_parse_csv_is_separator
							(cur, parseoptions->sep.chr, parseoptions->sep.str);
						if (post) {
							cur = post;
							saw_sep = TRUE;
							break;
						}
						cur = g_utf8_next_char (cur);
					}
					break;
				}
			}

			g_string_append_unichar (text, uc);
		}

		/* We silently allow a missing terminating quote.  */
	} else {
		/* Unquoted field.  */

		while (*cur && !compare_terminator (cur, parseoptions)) {

			char const *post = stf_parse_csv_is_separator
				(cur, parseoptions->sep.chr, parseoptions->sep.str);
			if (post) {
				cur = post;
				saw_sep = TRUE;
				break;
			}

			g_string_append_unichar (text, g_utf8_get_char (cur));
			cur = g_utf8_next_char (cur);
		}

		if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
			while (text->len) {
				const char *last = g_utf8_prev_char (text->str + text->len);
				if (!g_unichar_isspace (g_utf8_get_char (last)))
					break;
				g_string_truncate (text, last - text->str);
			}
		}
	}

	src->position = cur;

	if (saw_sep && parseoptions->sep.duplicates)
		stf_parse_eat_separators (src, parseoptions);

	return saw_sep ? STF_CELL_FIELD_SEP : STF_CELL_FIELD_NO_SEP;
}

/**
 * stf_parse_csv_line:
 *
 * This will parse one line from the current @src->position.
 * NOTE: The calling routine is responsible for freeing the result.
 *
 * returns : a GPtrArray of char*'s
 **/
static GPtrArray *
stf_parse_csv_line (Source_t *src, StfParseOptions_t *parseoptions)
{
	GPtrArray *line;
	gboolean cont = FALSE;
	GString *text;

	g_return_val_if_fail (src != NULL, NULL);
	g_return_val_if_fail (parseoptions != NULL, NULL);

	line = g_ptr_array_new ();
	if (parseoptions->trim_seps)
		stf_parse_eat_separators (src, parseoptions);

	text = g_string_sized_new (30);

	while (1) {
		char *ctext;
		StfParseCellRes res =
			stf_parse_csv_cell (text, src, parseoptions);
		trim_spaces_inplace (text->str, parseoptions);
		ctext = g_string_chunk_insert_len (src->chunk,
						   text->str, text->len);
		g_string_truncate (text, 0);

		switch (res) {
		case STF_CELL_FIELD_NO_SEP:
			g_ptr_array_add (line, ctext);
			cont = FALSE;
			break;

		case STF_CELL_FIELD_SEP:
			g_ptr_array_add (line, ctext);
			cont = TRUE;  /* Make sure we see one more field.  */
			break;

		default:
			if (cont)
				g_ptr_array_add (line, ctext);
			g_string_free (text, TRUE);
			return line;
		}
	}
}

/**
 * stf_parse_fixed_cell:
 *
 * returns a pointer to the parsed cell contents.
 **/
static char *
stf_parse_fixed_cell (Source_t *src, StfParseOptions_t *parseoptions)
{
	char *res;
	char const *cur;
	int splitval;

	g_return_val_if_fail (src != NULL, NULL);
	g_return_val_if_fail (parseoptions != NULL, NULL);

	cur = src->position;

	if (src->splitpos < my_garray_len (parseoptions->splitpositions))
		splitval = (int) g_array_index (parseoptions->splitpositions, int, src->splitpos);
	else
		splitval = -1;

	while (*cur != 0 && !compare_terminator (cur, parseoptions) && splitval != src->linepos) {
		src->linepos++;
		cur = g_utf8_next_char (cur);
	}

	res = g_string_chunk_insert_len (src->chunk,
					 src->position,
					 cur - src->position);

	src->position = cur;

	return res;
}

/**
 * stf_parse_fixed_line:
 *
 * This will parse one line from the current @src->position.
 * It will return a GPtrArray with the cell contents as strings.

 * NOTE: The calling routine is responsible for freeing result.
 **/
static GPtrArray *
stf_parse_fixed_line (Source_t *src, StfParseOptions_t *parseoptions)
{
	GPtrArray *line;

	g_return_val_if_fail (src != NULL, NULL);
	g_return_val_if_fail (parseoptions != NULL, NULL);

	src->linepos = 0;
	src->splitpos = 0;

	line = g_ptr_array_new ();
	while (*src->position != '\0' && !compare_terminator (src->position, parseoptions)) {
		char *field = stf_parse_fixed_cell (src, parseoptions);

		trim_spaces_inplace (field, parseoptions);
		g_ptr_array_add (line, field);

		src->splitpos++;
	}

	while (line->len < parseoptions->splitpositions->len)
		g_ptr_array_add (line, g_strdup (""));

	return line;
}

/**
 * stf_parse_general_free: (skip)
 */
void
stf_parse_general_free (GPtrArray *lines)
{
	unsigned lineno;
	for (lineno = 0; lineno < lines->len; lineno++) {
		GPtrArray *line = g_ptr_array_index (lines, lineno);
		/* Fields are not freed here.  */
		if (line)
			g_ptr_array_free (line, TRUE);
	}
	g_ptr_array_free (lines, TRUE);
}


/**
 * stf_parse_general: (skip)
 *
 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
 * GPtrArray of strings.
 *
 * The caller must free this entire structure, for example by calling
 * stf_parse_general_free.
 **/
GPtrArray *
stf_parse_general (StfParseOptions_t *parseoptions,
		   GStringChunk *lines_chunk,
		   char const *data, char const *data_end)
{
	GPtrArray *lines;
	Source_t src;
	int row;
	char const *valid_end = data_end;

	g_return_val_if_fail (parseoptions != NULL, NULL);
	g_return_val_if_fail (data != NULL, NULL);
	g_return_val_if_fail (data_end != NULL, NULL);
	g_return_val_if_fail (stf_parse_options_valid (parseoptions), NULL);
	g_return_val_if_fail (g_utf8_validate (data, data_end-data, &valid_end), NULL);

	src.chunk = lines_chunk;
	src.position = data;
	row = 0;

	if ((data_end-data >= 3) && !strncmp(src.position, "\xEF\xBB\xBF", 3)) {
		/* Skip over byte-order mark */
		src.position += 3;
	}

	lines = g_ptr_array_new ();
	while (*src.position != '\0' && src.position < data_end) {
		GPtrArray *line;

		if (row == GNM_MAX_ROWS) {
			parseoptions->rows_exceeded = TRUE;
			break;
		}

		line = parseoptions->parsetype == PARSE_TYPE_CSV
			? stf_parse_csv_line (&src, parseoptions)
			: stf_parse_fixed_line (&src, parseoptions);

		g_ptr_array_add (lines, line);
		if (parseoptions->parsetype != PARSE_TYPE_CSV)
			src.position += compare_terminator (src.position, parseoptions);
		row++;
	}

	return lines;
}

/**
 * stf_parse_lines: (skip)
 * @parseoptions: #StfParseOptions_t
 * @lines_chunk:
 * @data:
 * @maxlines:
 * @with_lineno:
 *
 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
 * GPtrArray of strings.
 *
 * The caller must free this entire structure, for example by calling
 * stf_parse_general_free.
 **/
GPtrArray *
stf_parse_lines (StfParseOptions_t *parseoptions,
		 GStringChunk *lines_chunk,
		 char const *data,
		 int maxlines, gboolean with_lineno)
{
	GPtrArray *lines;
	int lineno = 1;

	g_return_val_if_fail (data != NULL, NULL);

	lines = g_ptr_array_new ();
	while (*data) {
		char const *data0 = data;
		GPtrArray *line = g_ptr_array_new ();

		if (with_lineno) {
			char buf[4 * sizeof (int)];
			sprintf (buf, "%d", lineno);
			g_ptr_array_add (line,
					 g_string_chunk_insert (lines_chunk, buf));
		}

		while (1) {
			int termlen = compare_terminator (data, parseoptions);
			if (termlen > 0 || *data == 0) {
				g_ptr_array_add (line,
						 g_string_chunk_insert_len (lines_chunk,
									    data0,
									    data - data0));
				data += termlen;
				break;
			} else
				data = g_utf8_next_char (data);
		}

		g_ptr_array_add (lines, line);

		lineno++;
		if (lineno >= maxlines)
			break;
	}
	return lines;
}

char const *
stf_parse_find_line (StfParseOptions_t *parseoptions,
		     char const *data,
		     int line)
{
	while (line > 0) {
		int termlen = compare_terminator (data, parseoptions);
		if (termlen > 0) {
			data += termlen;
			line--;
		} else if (*data == 0) {
			return data;
		} else {
			data = g_utf8_next_char (data);
		}
	}
	return data;
}


/**
 * stf_parse_options_fixed_autodiscover:
 * @parseoptions: a Parse options struct.
 * @data: The actual data.
 * @data_end: data end.
 *
 * Automatically try to discover columns in the text to be parsed.
 * We ignore empty lines (only containing parseoptions->terminator)
 *
 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
 *        Think hard of a better more flexible solution...
 **/
void
stf_parse_options_fixed_autodiscover (StfParseOptions_t *parseoptions,
				      char const *data, char const *data_end)
{
	char const *iterator = data;
	GSList *list = NULL;
	GSList *list_start = NULL;
	int lines = 0;
	int effective_lines = 0;
	int max_line_length = 0;
	int *line_begin_hits = NULL;
	int *line_end_hits = NULL;
	int i;

	stf_parse_options_fixed_splitpositions_clear (parseoptions);

	/*
	 * First take a look at all possible white space combinations
	 */
	while (*iterator && iterator < data_end) {
		gboolean begin_recorded = FALSE;
		AutoDiscovery_t *disc = NULL;
		int position = 0;
		int termlen = 0;

		while (*iterator && (termlen = compare_terminator (iterator, parseoptions)) == 0) {
			if (!begin_recorded && *iterator == ' ') {
				disc = g_new0 (AutoDiscovery_t, 1);

				disc->start = position;

				begin_recorded = TRUE;
			} else if (begin_recorded && *iterator != ' ') {
				disc->stop = position;
				list = g_slist_prepend (list, disc);

				begin_recorded = FALSE;
				disc = NULL;
			}

			position++;
			iterator++;
		}

		if (position > max_line_length)
			max_line_length = position;

		/*
		 * If there are excess spaces at the end of
		 * the line : ignore them
		 */
		g_free (disc);

		/*
		 * Hop over the terminator
		 */
		iterator += termlen;

		if (position != 0)
			effective_lines++;

		lines++;
	}

	list       = g_slist_reverse (list);
	list_start = list;

	/*
	 * Kewl stuff :
	 * Look at the number of hits at each line position
	 * if the number of hits equals the number of lines
	 * we can be pretty sure this is the start or end
	 * of a column, we filter out empty columns
	 * later
	 */
	line_begin_hits = g_new0 (int, max_line_length + 1);
	line_end_hits   = g_new0 (int, max_line_length + 1);

	while (list) {
		AutoDiscovery_t *disc = list->data;

		line_begin_hits[disc->start]++;
		line_end_hits[disc->stop]++;

		g_free (disc);

		list = g_slist_next (list);
	}
	g_slist_free (list_start);

	for (i = 0; i < max_line_length + 1; i++)
		if (line_begin_hits[i] == effective_lines || line_end_hits[i] == effective_lines)
			stf_parse_options_fixed_splitpositions_add (parseoptions, i);

	/*
	 * Do some corrections to the initial columns
	 * detected here, we obviously don't need to
	 * do this if there are no columns at all.
	 */
	if (my_garray_len (parseoptions->splitpositions) > 0) {
		/*
		 * Try to find columns that look like :
		 *
		 * Example     100
		 * Example2      9
		 *
		 * (In other words : Columns with left & right justification with
		 *  a minimum of 2 spaces in the middle)
		 * Split these columns in 2
		 */

		for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
			int begin = g_array_index (parseoptions->splitpositions, int, i);
			int end   = g_array_index (parseoptions->splitpositions, int, i + 1);
			int num_spaces   = -1;
			int spaces_start = 0;
			gboolean right_aligned = TRUE;
			gboolean left_aligned  = TRUE;
			gboolean has_2_spaces  = TRUE;

			iterator = data;
			lines = 0;
			while (*iterator && iterator < data_end) {
				gboolean trigger = FALSE;
				gboolean space_trigger = FALSE;
				int pos = 0;

				num_spaces   = -1;
				spaces_start = 0;
				while (*iterator && !compare_terminator (iterator, parseoptions)) {
					if (pos == begin) {
						if (*iterator == ' ')
							left_aligned = FALSE;

						trigger = TRUE;
					} else if (pos == end - 1) {
						if (*iterator == ' ')
							right_aligned = FALSE;

						trigger = FALSE;
					}

					if (trigger || pos == end - 1) {
						if (!space_trigger && *iterator == ' ') {
							space_trigger = TRUE;
							spaces_start = pos;
						} else if (space_trigger && *iterator != ' ') {
							space_trigger = FALSE;
							num_spaces = pos - spaces_start;
						}
					}

					iterator++;
					pos++;
				}

				if (num_spaces < 2)
					has_2_spaces = FALSE;

				if (*iterator)
					iterator++;

				lines++;
			}

			/*
			 * If this column meets all the criteria
			 * split it into two at the last measured
			 * spaces_start + num_spaces
			 */
			if (has_2_spaces && right_aligned && left_aligned) {
				int val = (((spaces_start + num_spaces) - spaces_start) / 2) + spaces_start;

				g_array_insert_val (parseoptions->splitpositions, i + 1, val);

				/*
				 * Skip over the inserted column
				 */
				i++;
			}
		}

		/*
		 * Remove empty columns here if needed
		 */
		for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
			int begin = g_array_index (parseoptions->splitpositions, int, i);
			int end = g_array_index (parseoptions->splitpositions, int, i + 1);
			gboolean only_spaces = TRUE;

			iterator = data;
			lines = 0;
			while (*iterator && iterator < data_end) {
				gboolean trigger = FALSE;
				int pos = 0;

				while (*iterator && !compare_terminator (iterator, parseoptions)) {
					if (pos == begin)
						trigger = TRUE;
					else if (pos == end)
						trigger = FALSE;

					if (trigger) {
						if (*iterator != ' ')
							only_spaces = FALSE;
					}

					iterator++;
					pos++;
				}

				if (*iterator)
					iterator++;

				lines++;
			}

			/*
			 * The column only contains spaces
			 * remove it
			 */
			if (only_spaces) {
				g_array_remove_index (parseoptions->splitpositions, i);

				/*
				 * We HAVE to make sure that the next column (end) also
				 * gets checked out. If we don't decrease "i" here, we
				 * will skip over it as the indexes shift down after
				 * the removal
				 */
				i--;
			}
		}
	}

	g_free (line_begin_hits);
	g_free (line_end_hits);
}

/*******************************************************************************************************
 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
 *               functions into something meaningful (== application specific)
 *******************************************************************************************************/

/*
 * This is more or less as gnm_cell_set_text, except...
 * 1. Unknown names are not allowed.
 * 2. Only '=' can start an expression.
 */

static void
stf_cell_set_text (GnmCell *cell, char const *text)
{
	GnmExprTop const *texpr;
	GnmValue *val;
	GOFormat const *fmt = gnm_style_get_format (gnm_cell_get_style (cell));
	const GODateConventions *date_conv =
		workbook_date_conv (cell->base.sheet->workbook);

	if (*text == '=' && text[1] != 0) {
		GnmExprParseFlags flags =
			GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID;
		const char *expr_start = text + 1;
		GnmParsePos pos;
		val = NULL;
		parse_pos_init_cell (&pos, cell);
		texpr = gnm_expr_parse_str (expr_start, &pos, flags,
					    NULL, NULL);
	} else {
		texpr = NULL;
		val = format_match (text, fmt, date_conv);
	}

	if (!val && !texpr)
		val = value_new_string (text);

	if (val)
		gnm_cell_set_value (cell, val);
	else {
		gnm_cell_set_expr (cell, texpr);
		gnm_expr_top_unref (texpr);
	}
}

static void
stf_read_remember_settings (Workbook *book, StfParseOptions_t *po)
{
	if (po->parsetype == PARSE_TYPE_CSV) {
		GnmStfExport *stfe = gnm_stf_get_stfe (G_OBJECT (book));
		char quote[6];
		int length = g_unichar_to_utf8 (po->stringindicator, quote);
		if (length > 5) {
			quote[0] = '"';
			quote[1] = '\0';
		} else quote[length] = '\0';

		g_object_set (G_OBJECT (stfe), "separator", po->sep.chr, "quote", &quote, NULL);

		if ((po->terminator != NULL) &&  (po->terminator->data != NULL))
			g_object_set (G_OBJECT (stfe), "eol", po->terminator->data, NULL);
	}
}

gboolean
stf_parse_sheet (StfParseOptions_t *parseoptions,
		 char const *data, char const *data_end,
		 Sheet *sheet, int start_col, int start_row)
{
	int row;
	unsigned int lrow;
	GStringChunk *lines_chunk;
	GPtrArray *lines;
	gboolean result = TRUE;
	int col;
	unsigned int lcol;

	SETUP_LOCALE_SWITCH;

	g_return_val_if_fail (parseoptions != NULL, FALSE);
	g_return_val_if_fail (data != NULL, FALSE);
	g_return_val_if_fail (IS_SHEET (sheet), FALSE);

	if (!data_end)
		data_end = data + strlen (data);

	lines_chunk = g_string_chunk_new (100 * 1024);
	lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
	if (lines == NULL)
		result = FALSE;

	START_LOCALE_SWITCH;
	for (row = start_row, lrow = 0;
	     result && lrow < lines->len;
	     row++, lrow++) {
		GPtrArray *line;

		if (row >= gnm_sheet_get_max_rows (sheet)) {
			if (!parseoptions->rows_exceeded) {
				/* FIXME: What locale?  */
				g_warning (_("There are more rows of data than "
					     "there is room for in the sheet.  Extra "
					     "rows will be ignored."));
				parseoptions->rows_exceeded = TRUE;
			}
			break;
		}

		col = start_col;
		line = g_ptr_array_index (lines, lrow);

		for (lcol = 0; lcol < line->len; lcol++) {
			gboolean want_col =
				(parseoptions->col_import_array == NULL ||
				 parseoptions->col_import_array_len <= lcol ||
				 parseoptions->col_import_array[lcol]);
			if (!want_col)
				continue;

			if (col >= gnm_sheet_get_max_cols (sheet)) {
				if (!parseoptions->cols_exceeded) {
					/* FIXME: What locale?  */
					g_warning (_("There are more columns of data than "
						     "there is room for in the sheet.  Extra "
						     "columns will be ignored."));
					parseoptions->cols_exceeded = TRUE;
				}
				break;
			} else {
				char const *text = g_ptr_array_index (line, lcol);
				if (text && *text) {
					GnmCell *cell = sheet_cell_fetch (sheet, col, row);
					stf_cell_set_text (cell, text);
				}
			}
			col++;
		}

		g_ptr_array_index (lines, lrow) = NULL;
		g_ptr_array_free (line, TRUE);
	}
	END_LOCALE_SWITCH;

	for (lcol = 0, col = start_col;
	     lcol < parseoptions->col_import_array_len  && col < gnm_sheet_get_max_cols (sheet);
	     lcol++) {
		if (parseoptions->col_import_array == NULL ||
		    parseoptions->col_import_array_len <= lcol ||
		    parseoptions->col_import_array[lcol]) {
			if (parseoptions->col_autofit_array == NULL ||
			    parseoptions->col_autofit_array[lcol]) {
				ColRowIndexList *list = colrow_get_index_list (col, col, NULL);
				ColRowStateGroup  *state = colrow_set_sizes (sheet, TRUE, list, -1, 0, -1);
				colrow_index_list_destroy (list);
				g_slist_free (state);
			}
			col++;
		}
	}

	g_string_chunk_free (lines_chunk);
	if (lines)
		stf_parse_general_free (lines);
	if (result)
		stf_read_remember_settings (sheet->workbook, parseoptions);
	return result;
}

GnmCellRegion *
stf_parse_region (StfParseOptions_t *parseoptions, char const *data, char const *data_end,
		  Workbook const *wb)
{
	static GODateConventions const default_conv = {FALSE};
	GODateConventions const *date_conv = wb ? workbook_date_conv (wb) : &default_conv;

	GnmCellRegion *cr;
	unsigned int row, colhigh = 0;
	GStringChunk *lines_chunk;
	GPtrArray *lines;

	SETUP_LOCALE_SWITCH;

	g_return_val_if_fail (parseoptions != NULL, NULL);
	g_return_val_if_fail (data != NULL, NULL);

	START_LOCALE_SWITCH;

	cr = gnm_cell_region_new (NULL);

	if (!data_end)
		data_end = data + strlen (data);
	lines_chunk = g_string_chunk_new (100 * 1024);
	lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
	for (row = 0; row < lines->len; row++) {
		GPtrArray *line = g_ptr_array_index (lines, row);
		unsigned int col, targetcol = 0;
		for (col = 0; col < line->len; col++) {
			if (parseoptions->col_import_array == NULL ||
			    parseoptions->col_import_array_len <= col ||
			    parseoptions->col_import_array[col]) {
				const char *text = g_ptr_array_index (line, col);
				if (text) {
					GOFormat *fmt = NULL;
					GnmValue *v;
					GnmCellCopy *cc;

					if (col < parseoptions->formats->len)
						fmt = g_ptr_array_index (parseoptions->formats, col);
					v = format_match (text, fmt, date_conv);
					if (!v)
						v = value_new_string (text);

					cc = gnm_cell_copy_new (cr, targetcol, row);
					cc->val  = v;
					cc->texpr = NULL;
					targetcol++;
					if (targetcol > colhigh)
						colhigh = targetcol;
				}
			}
		}
	}
	stf_parse_general_free (lines);
	g_string_chunk_free (lines_chunk);

	END_LOCALE_SWITCH;

	cr->cols    = (colhigh > 0) ? colhigh : 1;
	cr->rows    = row;

	return cr;
}

static int
int_sort (void const *a, void const *b)
{
	return *(int const *)a - *(int const *)b;
}

static int
count_character (GPtrArray *lines, gunichar c, double quantile)
{
	int *counts, res;
	unsigned int lno, cno;

	if (lines->len == 0)
		return 0;

	counts = g_new (int, lines->len);
	for (lno = cno = 0; lno < lines->len; lno++) {
		int count = 0;
		GPtrArray *boxline = g_ptr_array_index (lines, lno);
		char const *line = g_ptr_array_index (boxline, 0);

		/* Ignore empty lines.  */
		if (*line == 0)
			continue;

		while (*line) {
			if (g_utf8_get_char (line) == c)
				count++;
			line = g_utf8_next_char (line);
		}

		counts[cno++] = count;
	}

	if (cno == 0)
		res = 0;
	else {
		unsigned int qi = (unsigned int)ceil (quantile * cno);
		qsort (counts, cno, sizeof (counts[0]), int_sort);
		if (qi == cno)
			qi--;
		res = counts[qi];
	}

	g_free (counts);

	return res;
}

static void
dump_guessed_options (const StfParseOptions_t *res)
{
	GSList *l;
	char ubuffer[6 + 1];

	g_printerr ("Guessed format:\n");
	switch (res->parsetype) {
	case PARSE_TYPE_CSV:
		g_printerr ("  type = sep\n");
		g_printerr ("  separator = %s\n",
			    res->sep.chr ? res->sep.chr : "(none)");
		g_printerr ("    see two as one = %s\n",
			    res->sep.duplicates ? "yes" : "no");
		break;
	case PARSE_TYPE_FIXED:
		g_printerr ("  type = sep\n");
		break;
	default:
		;
	}
	g_printerr ("  trim space = %d\n", res->trim_spaces);

	ubuffer[g_unichar_to_utf8 (res->stringindicator, ubuffer)] = 0;
	g_printerr ("  string indicator = %s\n", ubuffer);
	g_printerr ("    see two as one = %s\n",
		    res->indicator_2x_is_single ? "yes" : "no");

	g_printerr ("  line terminators =");
	for (l = res->terminator; l; l = l->next) {
		const char *t = l->data;
		if (strcmp (t, "\n"))
			g_printerr (" unix");
		else if (strcmp (t, "\r"))
			g_printerr (" mac");
		else if (strcmp (t, "\r\n"))
			g_printerr (" dos");
	}
	g_printerr ("\n");
}

/**
 * stf_parse_options_guess:
 * @data: the input data.
 *
 * Returns: (transfer full): the guessed options.
 **/
StfParseOptions_t *
stf_parse_options_guess (char const *data)
{
	StfParseOptions_t *res;
	GStringChunk *lines_chunk;
	GPtrArray *lines;
	int tabcount;
	int sepcount;
	gunichar sepchar = go_locale_get_arg_sep ();

	g_return_val_if_fail (data != NULL, NULL);

	res = stf_parse_options_new ();
	lines_chunk = g_string_chunk_new (100 * 1024);
	lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);

	tabcount = count_character (lines, '\t', 0.2);
	sepcount = count_character (lines, sepchar, 0.2);

	/* At least one tab per line and enough to separate every
	   would-be sepchars.  */
	if (tabcount >= 1 && tabcount >= sepcount - 1)
		stf_parse_options_csv_set_separators (res, "\t", NULL);
	else {
		gunichar c;

		/*
		 * Try a few more or less likely characters and pick the first
		 * one that occurs on at least half the lines.
		 *
		 * The order is mostly random, although ' ' and '!' which
		 * could very easily occur in text are put last.
		 */
		if (count_character (lines, (c = sepchar), 0.5) > 0 ||
		    count_character (lines, (c = go_locale_get_col_sep ()), 0.5) > 0 ||
		    count_character (lines, (c = ':'), 0.5) > 0 ||
		    count_character (lines, (c = ','), 0.5) > 0 ||
		    count_character (lines, (c = ';'), 0.5) > 0 ||
		    count_character (lines, (c = '|'), 0.5) > 0 ||
		    count_character (lines, (c = '!'), 0.5) > 0 ||
		    count_character (lines, (c = ' '), 0.5) > 0) {
			char sep[7];
			sep[g_unichar_to_utf8 (c, sep)] = 0;
			if (c == ' ')
				strcat (sep, "\t");
			stf_parse_options_csv_set_separators (res, sep, NULL);
		}
	}

	if (1) {
		/* Separated */
		gboolean dups =
			res->sep.chr &&
			strchr (res->sep.chr, ' ') != NULL;
		gboolean trim =
			res->sep.chr &&
			strchr (res->sep.chr, ' ') != NULL;

		stf_parse_options_set_type (res, PARSE_TYPE_CSV);
		stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
		stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
		stf_parse_options_csv_set_duplicates (res, dups);
		stf_parse_options_csv_set_trim_seps (res, trim);

		stf_parse_options_csv_set_stringindicator (res, '"');
	} else {
		/* Fixed-width */
	}

	stf_parse_general_free (lines);
	g_string_chunk_free (lines_chunk);

	if (gnm_debug_flag ("stf"))
		dump_guessed_options (res);

	return res;
}

/**
 * stf_parse_options_guess_csv:
 * @data: the CSV input data.
 *
 * Returns: (transfer full): the guessed options.
 **/
StfParseOptions_t *
stf_parse_options_guess_csv (char const *data)
{
	StfParseOptions_t *res;
	GStringChunk *lines_chunk;
	GPtrArray *lines;
	char *sep = NULL;
	char const *quoteline = NULL;
	int pass;
	gunichar stringind = '"';

	g_return_val_if_fail (data != NULL, NULL);

	res = stf_parse_options_new ();
	stf_parse_options_set_type (res, PARSE_TYPE_CSV);
	stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
	stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
	stf_parse_options_csv_set_duplicates (res, FALSE);
	stf_parse_options_csv_set_trim_seps (res, FALSE);
	stf_parse_options_csv_set_stringindicator (res, stringind);

	lines_chunk = g_string_chunk_new (100 * 1024);
	lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);

	/*
	 * Find a line containing a quote; skip first line unless it is
	 * the only one.  Prefer a line with the quote first.
	 */
	for (pass = 1; !quoteline && pass <= 2; pass++) {
		size_t lno;
		for (lno = MIN (1, lines->len - 1);
		     !quoteline && lno < lines->len;
		     lno++) {
			GPtrArray *boxline = g_ptr_array_index (lines, lno);
			const char *line = g_ptr_array_index (boxline, 0);
			switch (pass) {
			case 1:
				if (g_utf8_get_char (line) == stringind)
					quoteline = line;
				break;
			case 2:
				if (my_utf8_strchr (line, stringind))
					quoteline = line;
				break;
			}
		}
	}

	if (quoteline) {
		const char *p0 = my_utf8_strchr (quoteline, stringind);
		const char *p = p0;

		do {
			p = g_utf8_next_char (p);
		} while (*p && g_utf8_get_char (p) != stringind);
		if (*p) p = g_utf8_next_char (p);
		while (*p && g_unichar_isspace (g_utf8_get_char (p)))
			p = g_utf8_next_char (p);
		if (*p) {
			/* Use the character after the quote.  */
			sep = g_strndup (p, g_utf8_next_char (p) - p);
		} else {
			/* Try to use character before the quote.  */
			while (p0 > quoteline && !sep) {
				p = p0;
				p0 = g_utf8_prev_char (p0);
				if (!g_unichar_isspace (g_utf8_get_char (p0)))
					sep = g_strndup (p0, p - p0);
			}
		}
	}

	if (!sep)
		sep = g_strdup (",");
	stf_parse_options_csv_set_separators (res, sep, NULL);
	g_free (sep);

	stf_parse_general_free (lines);
	g_string_chunk_free (lines_chunk);

	if (gnm_debug_flag ("stf"))
		dump_guessed_options (res);

	return res;
}
