diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx index 658a8aa..2ece1cd 100644 --- a/src/hunspell/affixmgr.cxx +++ b/src/hunspell/affixmgr.cxx @@ -87,11 +87,19 @@ #include "csutil.hxx" +#ifdef HUNSPELL_CHROME_CLIENT +AffixMgr::AffixMgr(hunspell::BDictReader* reader, + const std::vector& ptr) + : alldic(ptr) + , pHMgr(ptr[0]) { + bdict_reader = reader; +#else AffixMgr::AffixMgr(const char* affpath, const std::vector& ptr, const char* key) : alldic(ptr) , pHMgr(ptr[0]) { +#endif // register hash manager and load affix data from aff file csconv = NULL; @@ -166,9 +174,17 @@ AffixMgr::AffixMgr(const char* affpath, sFlag[i] = NULL; } +#ifdef HUNSPELL_CHROME_CLIENT + // Define dummy parameters for parse_file() to avoid changing the parameters + // of parse_file(). This may make it easier to merge the changes of the + // original hunspell. + const char* affpath = NULL; + const char* key = NULL; +#else for (int j = 0; j < CONTSIZE; j++) { contclasses[j] = 0; } +#endif if (parse_file(affpath, key)) { HUNSPELL_WARNING(stderr, "Failure loading aff file %s\n", affpath); @@ -246,7 +262,44 @@ void AffixMgr::finishFileMgr(FileMgr* afflst) { // read in aff file and build up prefix and suffix entry objects int AffixMgr::parse_file(const char* affpath, const char* key) { + std::string line; +#ifdef HUNSPELL_CHROME_CLIENT + // open the affix file + // We're always UTF-8 + utf8 = 1; + + // A BDICT file stores PFX and SFX lines in a special section and it provides + // a special line iterator for reading PFX and SFX lines. + // We create a FileMgr object from this iterator and parse PFX and SFX lines + // before parsing other lines. + hunspell::LineIterator affix_iterator = bdict_reader->GetAffixLineIterator(); + FileMgr* iterator = new FileMgr(&affix_iterator); + if (!iterator) { + HUNSPELL_WARNING(stderr, + "error: could not create a FileMgr from an affix line iterator.\n"); + return 1; + } + + while (iterator->getline(line)) { + char ft = ' '; + if (line.compare(0, 3, "PFX") == 0) ft = complexprefixes ? 'S' : 'P'; + if (line.compare(0, 3, "SFX") == 0) ft = complexprefixes ? 'P' : 'S'; + if (ft != ' ') + parse_affix(line, ft, iterator, NULL); + } + delete iterator; + // Create a FileMgr object for reading lines except PFX and SFX lines. + // We don't need to change the loop below since our FileMgr emulates the + // original one. + hunspell::LineIterator other_iterator = bdict_reader->GetOtherLineIterator(); + FileMgr * afflst = new FileMgr(&other_iterator); + if (!afflst) { + HUNSPELL_WARNING(stderr, + "error: could not create a FileMgr from an other line iterator.\n"); + return 1; + } +#else // checking flag duplication char dupflags[CONTSIZE]; char dupflags_ini = 1; @@ -261,16 +314,17 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { stderr, "error: could not open affix description file %s\n", affpath); return 1; } +#endif // step one is to parse the affix file building up the internal // affix data structures // read in each line ignoring any that do not // start with a known line type indicator - std::string line; while (afflst->getline(line)) { mychomp(line); +#ifndef HUNSPELL_CHROME_CLIENT /* remove byte order mark */ if (firstline) { firstline = 0; @@ -280,6 +334,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { line.erase(0, 3); } } +#endif /* parse in the keyboard string */ if (line.compare(0, 3, "KEY", 3) == 0) { @@ -532,6 +587,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { } } +#ifndef HUNSPELL_CHROME_CLIENT /* parse in the typical fault correcting table */ if (line.compare(0, 3, "REP", 3) == 0) { if (!parse_reptable(line, afflst)) { @@ -539,6 +595,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { return 1; } } +#endif /* parse in the input conversion table */ if (line.compare(0, 5, "ICONV", 5) == 0) { @@ -688,6 +745,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { checksharps = 1; } +#ifndef HUNSPELL_CHROME_CLIENT /* parse this affix: P - prefix, S - suffix */ // affix type char ft = ' '; @@ -705,6 +763,7 @@ int AffixMgr::parse_file(const char* affpath, const char* key) { return 1; } } +#endif } finishFileMgr(afflst); @@ -1281,6 +1340,24 @@ std::string AffixMgr::prefix_check_twosfx_morph(const char* word, // Is word a non compound with a REP substitution (see checkcompoundrep)? int AffixMgr::cpdrep_check(const char* word, int wl) { +#ifdef HUNSPELL_CHROME_CLIENT + const char *pattern, *pattern2; + hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator(); + while (iterator.GetNext(&pattern, &pattern2)) { + const char* r = word; + const size_t lenr = strlen(pattern2); + const size_t lenp = strlen(pattern); + + // search every occurence of the pattern in the word + while ((r=strstr(r, pattern)) != NULL) { + std::string candidate(word); + candidate.replace(r-word, lenp, pattern2); + if (candidate_check(candidate.c_str(), candidate.size())) return 1; + r++; // search for the next letter + } + } + +#else if ((wl < 2) || reptable.empty()) return 0; @@ -1299,6 +1376,7 @@ int AffixMgr::cpdrep_check(const char* word, int wl) { ++r; // search for the next letter } } +#endif return 0; } @@ -4489,6 +4567,7 @@ bool AffixMgr::parse_affix(const std::string& line, case 1: { np++; aflag = pHMgr->decode_flag(std::string(start_piece, iter).c_str()); +#ifndef HUNSPELL_CHROME_CLIENT // We don't check for duplicates. if (((at == 'S') && (dupflags[aflag] & dupSFX)) || ((at == 'P') && (dupflags[aflag] & dupPFX))) { HUNSPELL_WARNING( @@ -4497,6 +4576,7 @@ bool AffixMgr::parse_affix(const std::string& line, af->getlinenum()); } dupflags[aflag] += (char)((at == 'S') ? dupSFX : dupPFX); +#endif break; } // piece 3 - is cross product indicator diff --git a/src/hunspell/affixmgr.hxx b/src/hunspell/affixmgr.hxx index 83a4b42..162b475 100644 --- a/src/hunspell/affixmgr.hxx +++ b/src/hunspell/affixmgr.hxx @@ -92,6 +92,40 @@ class PfxEntry; class SfxEntry; +#ifdef HUNSPELL_CHROME_CLIENT + +#include + +// This class provides an implementation of the contclasses array in AffixMgr +// that is normally a large static array. We should almost never need more than +// 256 elements, so this class only allocates that much to start off with. If +// elements higher than that are actually used, we'll automatically expand. +class ContClasses { + public: + ContClasses() { + // Pre-allocate a buffer so that typically, we'll never have to resize. + EnsureSizeIs(256); + } + + char& operator[](size_t index) { + EnsureSizeIs(index + 1); + return data[index]; + } + + void EnsureSizeIs(size_t new_size) { + if (data.size() >= new_size) + return; // Nothing to do. + + size_t old_size = data.size(); + data.resize(new_size); + memset(&data[old_size], 0, new_size - old_size); + } + + std::vector data; +}; + +#endif // HUNSPELL_CHROME_CLIENT + class AffixMgr { PfxEntry* pStart[SETSIZE]; SfxEntry* sStart[SETSIZE]; @@ -175,11 +209,19 @@ class AffixMgr { int fullstrip; int havecontclass; // boolean variable +#ifdef HUNSPELL_CHROME_CLIENT + ContClasses contclasses; +#else char contclasses[CONTSIZE]; // flags of possible continuing classes (twofold // affix) +#endif public: +#ifdef HUNSPELL_CHROME_CLIENT + AffixMgr(hunspell::BDictReader* reader, const std::vector& ptr); +#else AffixMgr(const char* affpath, const std::vector& ptr, const char* key = NULL); +#endif ~AffixMgr(); struct hentry* affix_check(const char* word, int len, @@ -337,6 +379,10 @@ class AffixMgr { int get_fullstrip() const; private: +#ifdef HUNSPELL_CHROME_CLIENT + // Not owned by us, owned by the Hunspell object. + hunspell::BDictReader* bdict_reader; +#endif int parse_file(const char* affpath, const char* key); bool parse_flag(const std::string& line, unsigned short* out, FileMgr* af); bool parse_num(const std::string& line, int* out, FileMgr* af); diff --git a/src/hunspell/filemgr.cxx b/src/hunspell/filemgr.cxx index b7c89b2..aef6dba 100644 --- a/src/hunspell/filemgr.cxx +++ b/src/hunspell/filemgr.cxx @@ -78,6 +78,32 @@ #include "filemgr.hxx" #include "csutil.hxx" +#ifdef HUNSPELL_CHROME_CLIENT +#include "third_party/hunspell/google/bdict_reader.h" + +FileMgr::FileMgr(hunspell::LineIterator* iterator) : iterator_(iterator) { +} + +FileMgr::~FileMgr() { +} + +bool FileMgr::getline(std::string& line) { + // Read one line from a BDICT file and return it, if we can read a line + // without errors. + bool result = iterator_->AdvanceAndCopy(line_, BUFSIZE - 1); + if (result) + line = line_; + return result; +} + +int FileMgr::getlinenum() { + // This function is used only for displaying a line number that causes a + // parser error. For a BDICT file, providing a line number doesn't help + // identifying the place where causes a parser error so much since it is a + // binary file. So, we just return 0. + return 0; +} +#else int FileMgr::fail(const char* err, const char* par) { fprintf(stderr, err, par); return -1; @@ -118,3 +144,4 @@ bool FileMgr::getline(std::string& dest) { int FileMgr::getlinenum() { return linenum; } +#endif diff --git a/src/hunspell/filemgr.hxx b/src/hunspell/filemgr.hxx index 991e924..65fae23 100644 --- a/src/hunspell/filemgr.hxx +++ b/src/hunspell/filemgr.hxx @@ -80,6 +80,30 @@ #include #include +#ifdef HUNSPELL_CHROME_CLIENT +namespace hunspell { +class LineIterator; +} // namespace hunspell + +// A class which encapsulates operations of reading a BDICT file. +// Chrome uses a BDICT file to compress hunspell dictionaries. A BDICT file is +// a binary file converted from a DIC file and an AFF file. (See +// "bdict_reader.h" for its format.) +// This class encapsulates the operations of reading a BDICT file and emulates +// the original FileMgr operations for AffixMgr so that it can read a BDICT +// file without so many changes. +class FileMgr { + public: + FileMgr(hunspell::LineIterator* iterator); + ~FileMgr(); + bool getline(std::string& line); + int getlinenum(); + + protected: + hunspell::LineIterator* iterator_; + char line_[BUFSIZE + 50]; // input buffer +}; +#else class FileMgr { private: FileMgr(const FileMgr&); @@ -99,3 +123,4 @@ class FileMgr { int getlinenum(); }; #endif +#endif diff --git a/src/hunspell/hashmgr.cxx b/src/hunspell/hashmgr.cxx index 1de1690..770fac1 100644 --- a/src/hunspell/hashmgr.cxx +++ b/src/hunspell/hashmgr.cxx @@ -84,8 +84,14 @@ // build a hash table from a munched word list +#ifdef HUNSPELL_CHROME_CLIENT +HashMgr::HashMgr(hunspell::BDictReader* reader) + : bdict_reader(reader), +#else HashMgr::HashMgr(const char* tpath, const char* apath, const char* key) - : tablesize(0), + : +#endif + tablesize(0), tableptr(NULL), flag_mode(FLAG_CHAR), complexprefixes(0), @@ -99,8 +105,14 @@ HashMgr::HashMgr(const char* tpath, const char* apath, const char* key) aliasm(NULL) { langnum = 0; csconv = 0; +#ifdef HUNSPELL_CHROME_CLIENT + // No tables to load, just the AF lines. + load_config(NULL, NULL); + int ec = LoadAFLines(); +#else load_config(apath, key); int ec = load_tables(tpath, key); +#endif if (ec) { /* error condition - what should we do here */ HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec); @@ -158,14 +170,57 @@ HashMgr::~HashMgr() { #endif #endif +#ifdef HUNSPELL_CHROME_CLIENT + EmptyHentryCache(); + for (std::vector::iterator it = pointer_to_strings_.begin(); + it != pointer_to_strings_.end(); ++it) { + delete *it; + } +#endif #ifdef MOZILLA_CLIENT delete[] csconv; #endif } +#ifdef HUNSPELL_CHROME_CLIENT +void HashMgr::EmptyHentryCache() { + // We need to delete each cache entry, and each additional one in the linked + // list of homonyms. + for (HEntryCache::iterator i = hentry_cache.begin(); + i != hentry_cache.end(); ++i) { + hentry* cur = i->second; + while (cur) { + hentry* next = cur->next_homonym; + DeleteHashEntry(cur); + cur = next; + } + } + hentry_cache.clear(); +} +#endif + // lookup a root word in the hashtable struct hentry* HashMgr::lookup(const char* word) const { +#ifdef HUNSPELL_CHROME_CLIENT + int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD]; + int affix_count = bdict_reader->FindWord(word, affix_ids); + if (affix_count == 0) { // look for custom added word + std::map::const_iterator iter = + custom_word_to_affix_id_map_.find(word); + if (iter != custom_word_to_affix_id_map_.end()) { + affix_count = 1; + affix_ids[0] = iter->second; + } + } + + static const int kMaxWordLen = 128; + static char word_buf[kMaxWordLen]; + // To take account of null-termination, we use upto 127. + strncpy(word_buf, word, kMaxWordLen - 1); + + return AffixIDsToHentry(word_buf, affix_ids, affix_count); +#else struct hentry* dp; if (tableptr) { dp = tableptr[hash(word)]; @@ -177,6 +232,7 @@ struct hentry* HashMgr::lookup(const char* word) const { } } return NULL; +#endif } // add a word to the hash table (private) @@ -186,6 +242,8 @@ int HashMgr::add_word(const std::string& in_word, int al, const std::string* in_desc, bool onlyupcase) { +// TODO: The following 40 lines or so are actually new. Should they be included? +#ifndef HUNSPELL_CHROME_CLIENT const std::string* word = &in_word; const std::string* desc = in_desc; @@ -318,6 +376,17 @@ int HashMgr::add_word(const std::string& in_word, delete desc_copy; delete word_copy; +#else + std::map::iterator iter = + custom_word_to_affix_id_map_.find(in_word); + if (iter == custom_word_to_affix_id_map_.end()) { // word needs to be added + std::string* new_string_word = new std::string(in_word); + pointer_to_strings_.push_back(new_string_word); + base::StringPiece sp(*(new_string_word)); + custom_word_to_affix_id_map_[sp] = 0; // no affixes for custom words + return 1; + } +#endif return 0; } @@ -378,6 +447,12 @@ int HashMgr::get_clen_and_captype(const std::string& word, int* captype) { // remove word (personal dictionary function for standalone applications) int HashMgr::remove(const std::string& word) { +#ifdef HUNSPELL_CHROME_CLIENT + std::map::iterator iter = + custom_word_to_affix_id_map_.find(word); + if (iter != custom_word_to_affix_id_map_.end()) + custom_word_to_affix_id_map_.erase(iter); +#else struct hentry* dp = lookup(word.c_str()); while (dp) { if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) { @@ -395,6 +470,7 @@ int HashMgr::remove(const std::string& word) { } dp = dp->next_homonym; } +#endif return 0; } @@ -469,6 +545,44 @@ int HashMgr::add_with_affix(const std::string& word, const std::string& example) // walk the hash table entry by entry - null at end // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp); struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const { +#ifdef HUNSPELL_CHROME_CLIENT + // Return NULL if dictionary is not valid. + if (!bdict_reader->IsValid()) + return NULL; + + // This function is only ever called by one place and not nested. We can + // therefore keep static state between calls and use |col| as a "reset" flag + // to avoid changing the API. It is set to -1 for the first call. + // Allocate the iterator on the heap to prevent an exit time destructor. + static hunspell::WordIterator& word_iterator = + *new hunspell::WordIterator(bdict_reader->GetAllWordIterator()); + if (col < 0) { + col = 1; + word_iterator = bdict_reader->GetAllWordIterator(); + } + + int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD]; + static const int kMaxWordLen = 128; + static char word[kMaxWordLen]; + int affix_count = word_iterator.Advance(word, kMaxWordLen, affix_ids); + if (affix_count == 0) + return NULL; + short word_len = static_cast(strlen(word)); + + // Since hunspell 1.2.8, an hentry struct becomes a variable-length struct, + // i.e. a struct which uses its array 'word[1]' as a variable-length array. + // As noted above, this function is not nested. So, we just use a static + // struct which consists of an hentry and a char[kMaxWordLen], and initialize + // the static struct and return it for now. + // No need to create linked lists for the extra affixes. + static struct { + hentry entry; + char word[kMaxWordLen]; + } hash_entry; + + return InitHashEntry(&hash_entry.entry, sizeof(hash_entry), + &word[0], word_len, affix_ids[0]); +#else if (hp && hp->next != NULL) return hp->next; for (col++; col < tablesize; col++) { @@ -478,10 +592,12 @@ struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const { // null at end and reset to start col = -1; return NULL; +#endif } // load a munched word list and build a hash table on the fly int HashMgr::load_tables(const char* tpath, const char* key) { +#ifndef HUNSPELL_CHROME_CLIENT // open dictionary file FileMgr* dict = new FileMgr(tpath, key); if (dict == NULL) @@ -610,12 +726,16 @@ int HashMgr::load_tables(const char* tpath, const char* key) { } delete dict; +#endif return 0; } // the hash function is a simple load and rotate // algorithm borrowed int HashMgr::hash(const char* word) const { +#ifdef HUNSPELL_CHROME_CLIENT + return 0; +#else unsigned long hv = 0; for (int i = 0; i < 4 && *word != 0; i++) hv = (hv << 8) | (*word++); @@ -624,6 +744,7 @@ int HashMgr::hash(const char* word) const { hv ^= (*word++); } return (unsigned long)hv % tablesize; +#endif } int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const { @@ -833,7 +954,12 @@ int HashMgr::load_config(const char* affpath, const char* key) { int firstline = 1; // open the affix file +#ifdef HUNSPELL_CHROME_CLIENT + hunspell::LineIterator iterator = bdict_reader->GetOtherLineIterator(); + FileMgr * afflst = new FileMgr(&iterator); +#else FileMgr* afflst = new FileMgr(affpath, key); +#endif if (!afflst) { HUNSPELL_WARNING( stderr, "Error - could not open affix description file %s\n", affpath); @@ -1062,6 +1188,122 @@ bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) { return true; } +#ifdef HUNSPELL_CHROME_CLIENT +int HashMgr::LoadAFLines() +{ + utf8 = 1; // We always use UTF-8. + + // Read in all the AF lines which tell us the rules for each affix group ID. + hunspell::LineIterator iterator = bdict_reader->GetAfLineIterator(); + FileMgr afflst(&iterator); + std::string line; + while (afflst.getline(line)) { + int rv = parse_aliasf(line, &afflst); + if (rv) + return rv; + } + + return 0; +} + +hentry* HashMgr::InitHashEntry(hentry* entry, + size_t item_size, + const char* word, + int word_length, + int affix_index) const { + // Return if the given buffer doesn't have enough space for a hentry struct + // or the given word is too long. + // Our BDICT cannot handle words longer than (128 - 1) bytes. So, it is + // better to return an error if the given word is too long and prevent + // an unexpected result caused by a long word. + const int kMaxWordLen = 128; + if (item_size < sizeof(hentry) + word_length + 1 || + word_length >= kMaxWordLen) + return NULL; + + // Initialize a hentry struct with the given parameters, and + // append the given string at the end of this hentry struct. + memset(entry, 0, item_size); + FileMgr af(NULL); + entry->alen = static_cast( + const_cast(this)->get_aliasf(affix_index, &entry->astr, &af)); + entry->blen = static_cast(word_length); + memcpy(&entry->word, word, word_length); + + return entry; +} + +hentry* HashMgr::CreateHashEntry(const char* word, + int word_length, + int affix_index) const { + // Return if the given word is too long. + // (See the comment in HashMgr::InitHashEntry().) + const int kMaxWordLen = 128; + if (word_length >= kMaxWordLen) + return NULL; + + const size_t kEntrySize = sizeof(hentry) + word_length + 1; + struct hentry* entry = reinterpret_cast(malloc(kEntrySize)); + if (entry) + InitHashEntry(entry, kEntrySize, word, word_length, affix_index); + + return entry; +} + +void HashMgr::DeleteHashEntry(hentry* entry) const { + free(entry); +} + +hentry* HashMgr::AffixIDsToHentry(char* word, + int* affix_ids, + int affix_count) const +{ + if (affix_count == 0) + return NULL; + + HEntryCache& cache = const_cast(this)->hentry_cache; + std::string std_word(word); + HEntryCache::iterator found = cache.find(std_word); + if (found != cache.end()) { + // We must return an existing hentry for the same word if we've previously + // handed one out. Hunspell will compare pointers in some cases to see if + // two words it has found are the same. + return found->second; + } + + short word_len = static_cast(strlen(word)); + + // We can get a number of prefixes per word. There will normally be only one, + // but if not, there will be a linked list of "hentry"s for the "homonym"s + // for the word. + struct hentry* first_he = NULL; + struct hentry* prev_he = NULL; // For making linked list. + for (int i = 0; i < affix_count; i++) { + struct hentry* he = CreateHashEntry(word, word_len, affix_ids[i]); + if (!he) + break; + if (i == 0) + first_he = he; + if (prev_he) + prev_he->next_homonym = he; + prev_he = he; + } + + cache[std_word] = first_he; // Save this word in the cache for later. + return first_he; +} + +hentry* HashMgr::GetHentryFromHEntryCache(char* word) { + HEntryCache& cache = const_cast(this)->hentry_cache; + std::string std_word(word); + HEntryCache::iterator found = cache.find(std_word); + if (found != cache.end()) + return found->second; + else + return NULL; +} +#endif + int HashMgr::is_aliasf() const { return (aliasf != NULL); } diff --git a/src/hunspell/hashmgr.hxx b/src/hunspell/hashmgr.hxx index 812171a..af27d6c 100644 --- a/src/hunspell/hashmgr.hxx +++ b/src/hunspell/hashmgr.hxx @@ -82,9 +82,23 @@ #include "filemgr.hxx" #include "w_char.hxx" +#ifdef HUNSPELL_CHROME_CLIENT +#include + +#include "base/stl_util.h" +#include "base/strings/string_piece.h" +#include "third_party/hunspell/google/bdict_reader.h" +#endif + enum flag { FLAG_CHAR, FLAG_LONG, FLAG_NUM, FLAG_UNI }; class HashMgr { +#ifdef HUNSPELL_CHROME_CLIENT + // Not owned by this class, owned by the Hunspell object. + hunspell::BDictReader* bdict_reader; + std::map custom_word_to_affix_id_map_; + std::vector pointer_to_strings_; +#endif int tablesize; struct hentry** tableptr; flag flag_mode; @@ -104,7 +118,23 @@ class HashMgr { char** aliasm; public: +#ifdef HUNSPELL_CHROME_CLIENT + HashMgr(hunspell::BDictReader* reader); + + // Return the hentry corresponding to the given word. Returns NULL if the + // word is not there in the cache. + hentry* GetHentryFromHEntryCache(char* word); + + // Called before we do a new operation. This will empty the cache of pointers + // to hentries that we have cached. In Chrome, we make these on-demand, but + // they must live as long as the single spellcheck operation that they're part + // of since Hunspell will save pointers to various ones as it works. + // + // This function allows that cache to be emptied and not grow infinitely. + void EmptyHentryCache(); +#else HashMgr(const char* tpath, const char* apath, const char* key = NULL); +#endif ~HashMgr(); struct hentry* lookup(const char*) const; @@ -134,6 +164,40 @@ class HashMgr { bool onlyupcase); int load_config(const char* affpath, const char* key); bool parse_aliasf(const std::string& line, FileMgr* af); + +#ifdef HUNSPELL_CHROME_CLIENT + // Loads the AF lines from a BDICT. + // A BDICT file compresses its AF lines to save memory. + // This function decompresses each AF line and call parse_aliasf(). + int LoadAFLines(); + + // Helper functions that create a new hentry struct, initialize it, and + // delete it. + // These functions encapsulate non-trivial operations in creating and + // initializing a hentry struct from BDICT data to avoid changing code so much + // even when a hentry struct is changed. + hentry* InitHashEntry(hentry* entry, + size_t item_size, + const char* word, + int word_length, + int affix_index) const; + hentry* CreateHashEntry(const char* word, + int word_length, + int affix_index) const; + void DeleteHashEntry(hentry* entry) const; + + // Converts the list of affix IDs to a linked list of hentry structures. The + // hentry structures will point to the given word. The returned pointer will + // be a statically allocated variable that will change for the next call. The + // |word| buffer must be the same. + hentry* AffixIDsToHentry(char* word, int* affix_ids, int affix_count) const; + + // See EmptyHentryCache above. Note that each one is actually a linked list + // followed by the homonym pointer. + typedef std::map HEntryCache; + HEntryCache hentry_cache; +#endif + int add_hidden_capitalized_word(const std::string& word, int wcl, unsigned short* flags, diff --git a/src/hunspell/htypes.hxx b/src/hunspell/htypes.hxx index 1e6c118..c3aa2be 100644 --- a/src/hunspell/htypes.hxx +++ b/src/hunspell/htypes.hxx @@ -41,6 +41,16 @@ #ifndef HTYPES_HXX_ #define HTYPES_HXX_ +#ifdef HUNSPELL_CHROME_CLIENT +// This is a workaround for preventing errors in parsing Turkish BDICs, which +// contain very long AF lines (~ 12,000 chars). +// TODO(hbono) change the HashMgr::parse_aliasf() function to be able to parse +// longer lines than MAXDELEN. +#define MAXDELEN (8192 * 2) +#else +#define MAXDELEN 8192 +#endif // HUNSPELL_CHROME_CLIENT + #define ROTATE_LEN 5 #define ROTATE(v, q) \ diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx index abcdb8f..c8c5cf4 100644 --- a/src/hunspell/hunspell.cxx +++ b/src/hunspell/hunspell.cxx @@ -79,6 +79,9 @@ #include "hunspell.hxx" #include "suggestmgr.hxx" #include "hunspell.h" +#ifndef HUNSPELL_CHROME_CLIENT +# include "config.h" +#endif #include "csutil.hxx" #include #include -#define MAXWORDUTF8LEN (MAXWORDLEN * 2) +#define MAXWORDUTF8LEN (MAXWORDLEN * 3) class HunspellImpl { public: +#ifdef HUNSPELL_CHROME_CLIENT + HunspellImpl(const unsigned char* bdict_data, size_t bdict_length); +#else HunspellImpl(const char* affpath, const char* dpath, const char* key); +#endif ~HunspellImpl(); +#ifndef HUNSPELL_CHROME_CLIENT int add_dic(const char* dpath, const char* key); +#endif std::vector suffix_suggest(const std::string& root_word); std::vector generate(const std::string& word, const std::vector& pl); std::vector generate(const std::string& word, const std::string& pattern); @@ -116,7 +125,9 @@ private: AffixMgr* pAMgr; std::vector m_HMgrs; SuggestMgr* pSMgr; +#ifndef HUNSPELL_CHROME_CLIENT // We are using BDict instead. char* affixpath; +#endif std::string encoding; struct cs_info* csconv; int langnum; @@ -124,6 +135,11 @@ private: int complexprefixes; std::vector wordbreak; +#ifdef HUNSPELL_CHROME_CLIENT + // Not owned by us, owned by the Hunspell object. + hunspell::BDictReader* bdict_reader; +#endif + private: void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev); size_t cleanword2(std::string& dest, @@ -153,22 +169,43 @@ private: HunspellImpl& operator=(const HunspellImpl&); }; +#ifdef HUNSPELL_CHROME_CLIENT +Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length) + : m_Impl(new HunspellImpl(bdict_data, bdict_length)) { +#else Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key) : m_Impl(new HunspellImpl(affpath, dpath, key)) { +#endif } +#ifdef HUNSPELL_CHROME_CLIENT +HunspellImpl::HunspellImpl(const unsigned char* bdict_data, size_t bdict_length) { +#else HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) { +#endif csconv = NULL; utf8 = 0; complexprefixes = 0; +#ifndef HUNSPELL_CHROME_CLIENT affixpath = mystrdup(affpath); +#endif + +#ifdef HUNSPELL_CHROME_CLIENT + bdict_reader = new hunspell::BDictReader; + bdict_reader->Init(bdict_data, bdict_length); /* first set up the hash manager */ + m_HMgrs.push_back(new HashMgr(bdict_reader)); + + pAMgr = new AffixMgr(bdict_reader, m_HMgrs); // TODO: 'key' ? +#else + /* first set up the hash manager */ m_HMgrs.push_back(new HashMgr(dpath, affpath, key)); /* next set up the affix manager */ /* it needs access to the hash manager lookup methods */ pAMgr = new AffixMgr(affpath, m_HMgrs, key); +#endif /* get the preferred try string and the dictionary */ /* encoding from the Affix Manager for that dictionary */ @@ -185,7 +222,11 @@ HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* k strcpy(&dic_encoding_vec[0], encoding.c_str()); /* and finally set up the suggestion manager */ +#ifdef HUNSPELL_CHROME_CLIENT + pSMgr = new SuggestMgr(bdict_reader, try_string, MAXSUGGESTION, pAMgr); +#else pSMgr = new SuggestMgr(try_string, MAXSUGGESTION, pAMgr); +#endif if (try_string) free(try_string); } @@ -205,11 +246,17 @@ HunspellImpl::~HunspellImpl() { delete[] csconv; #endif csconv = NULL; +#ifdef HUNSPELL_CHROME_CLIENT + if (bdict_reader) delete bdict_reader; + bdict_reader = NULL; +#else if (affixpath) free(affixpath); affixpath = NULL; +#endif } +#ifndef HUNSPELL_CHROME_CLIENT // load extra dictionaries int Hunspell::add_dic(const char* dpath, const char* key) { return m_Impl->add_dic(dpath, key); @@ -222,6 +269,7 @@ int HunspellImpl::add_dic(const char* dpath, const char* key) { m_HMgrs.push_back(new HashMgr(dpath, affixpath, key)); return 0; } +#endif // make a copy of src at destination while removing all leading // blanks and removing any trailing periods after recording @@ -415,6 +463,9 @@ bool Hunspell::spell(const std::string& word, int* info, std::string* root) { } bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) { +#ifdef HUNSPELL_CHROME_CLIENT + if (m_HMgrs[0]) m_HMgrs[0]->EmptyHentryCache(); +#endif struct hentry* rv = NULL; int info2 = 0; @@ -721,6 +772,13 @@ struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::str if (!len) return NULL; +#ifdef HUNSPELL_CHROME_CLIENT + // We need to check if the word length is valid to make coverity (Event + // fixed_size_dest: Possible overrun of N byte fixed size buffer) happy. + if ((utf8 && strlen(word) >= MAXWORDUTF8LEN) || (!utf8 && strlen(word) >= MAXWORDLEN)) + return NULL; +#endif + // word reversing wrapper for complex prefixes if (complexprefixes) { if (!usebuffer) { @@ -833,6 +891,9 @@ std::vector Hunspell::suggest(const std::string& word) { } std::vector HunspellImpl::suggest(const std::string& word) { +#ifdef HUNSPELL_CHROME_CLIENT + if (m_HMgrs[0]) m_HMgrs[0]->EmptyHentryCache(); +#endif std::vector slst; int onlycmpdsug = 0; @@ -1875,22 +1936,32 @@ int Hunspell::generate(char*** slst, const char* word, const char* pattern) { } Hunhandle* Hunspell_create(const char* affpath, const char* dpath) { +#ifdef HUNSPELL_CHROME_CLIENT + return NULL; +#else return (Hunhandle*)(new Hunspell(affpath, dpath)); +#endif } Hunhandle* Hunspell_create_key(const char* affpath, const char* dpath, const char* key) { +#ifdef HUNSPELL_CHROME_CLIENT + return NULL; +#else return reinterpret_cast(new Hunspell(affpath, dpath, key)); +#endif } void Hunspell_destroy(Hunhandle* pHunspell) { delete reinterpret_cast(pHunspell); } +#ifndef HUNSPELL_CHROME_CLIENT int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) { return reinterpret_cast(pHunspell)->add_dic(dpath); } +#endif int Hunspell_spell(Hunhandle* pHunspell, const char* word) { return reinterpret_cast(pHunspell)->spell(std::string(word)); diff --git a/src/hunspell/hunspell.hxx b/src/hunspell/hunspell.hxx index 43af66b..a35df83 100644 --- a/src/hunspell/hunspell.hxx +++ b/src/hunspell/hunspell.hxx @@ -79,6 +79,10 @@ #include #include +#ifdef HUNSPELL_CHROME_CLIENT +#include "third_party/hunspell/google/bdict_reader.h" +#endif + #define SPELL_XML "" #define MAXSUGGESTION 15 @@ -115,11 +119,17 @@ class LIBHUNSPELL_DLL_EXPORTED Hunspell { * long path names (without the long path prefix Hunspell will use fopen() * with system-dependent character encoding instead of _wfopen()). */ +#ifdef HUNSPELL_CHROME_CLIENT + Hunspell(const unsigned char* bdict_data, size_t bdict_length); +#else Hunspell(const char* affpath, const char* dpath, const char* key = NULL); +#endif ~Hunspell(); +#ifndef HUNSPELL_CHROME_CLIENT /* load extra dictionaries (only dic files) */ int add_dic(const char* dpath, const char* key = NULL); +#endif /* spell(word) - spellcheck word * output: false = bad word, true = good word diff --git a/src/hunspell/replist.cxx b/src/hunspell/replist.cxx index 89d4caa..756dc03 100644 --- a/src/hunspell/replist.cxx +++ b/src/hunspell/replist.cxx @@ -167,6 +167,7 @@ int RepList::add(const std::string& in_pat1, const std::string& pat2) { mystrrep(r->outstrings[type], "_", " "); dat[pos++] = r; // sort to the right place in the list +#if 0 int i; for (i = pos - 1; i > 0; i--) { int c = strncmp(r->pattern.c_str(), dat[i-1]->pattern.c_str(), dat[i-1]->pattern.size()); @@ -184,6 +185,15 @@ int RepList::add(const std::string& in_pat1, const std::string& pat2) { } memmove(dat + i + 1, dat + i, (pos - i - 1) * sizeof(replentry *)); dat[i] = r; +#else + for (int i = pos - 1; i > 0; i--) { + r = dat[i]; + if (r->pattern < dat[i - 1]->pattern) { + dat[i] = dat[i - 1]; + dat[i - 1] = r; + } else break; + } +#endif return 0; } diff --git a/src/hunspell/replist.hxx b/src/hunspell/replist.hxx index 2f9d350..3afe005 100644 --- a/src/hunspell/replist.hxx +++ b/src/hunspell/replist.hxx @@ -75,6 +75,12 @@ #ifndef REPLIST_HXX_ #define REPLIST_HXX_ +#ifdef HUNSPELL_CHROME_CLIENT +// Compilation issues in spellchecker.cc think near is a macro, therefore +// removing it here solves that problem. +#undef near +#endif + #include "w_char.hxx" #include diff --git a/src/hunspell/suggestmgr.cxx b/src/hunspell/suggestmgr.cxx index b998341..4e122da 100644 --- a/src/hunspell/suggestmgr.cxx +++ b/src/hunspell/suggestmgr.cxx @@ -82,7 +82,112 @@ const w_char W_VLINE = {'\0', '|'}; +#ifdef HUNSPELL_CHROME_CLIENT +namespace { +// A simple class which creates temporary hentry objects which are available +// only in a scope. To conceal memory operations from SuggestMgr functions, +// this object automatically deletes all hentry objects created through +// CreateScopedHashEntry() calls in its destructor. So, the following snippet +// raises a memory error. +// +// hentry* bad_copy = NULL; +// { +// ScopedHashEntryFactory factory; +// hentry* scoped_copy = factory.CreateScopedHashEntry(0, source); +// ... +// bad_copy = scoped_copy; +// } +// if (bad_copy->word[0]) // memory for scoped_copy has been deleted! +// +// As listed in the above snippet, it is simple to use this class. +// 1. Declare an instance of this ScopedHashEntryFactory, and; +// 2. Call its CreateHashEntry() member instead of using 'new hentry' or +// 'operator='. +// +class ScopedHashEntryFactory { + public: + ScopedHashEntryFactory(); + ~ScopedHashEntryFactory(); + + // Creates a temporary copy of the given hentry struct. + // The returned copy is available only while this object is available. + // NOTE: this function just calls memcpy() in creating a copy of the given + // hentry struct, i.e. it does NOT copy objects referred by pointers of the + // given hentry struct. + hentry* CreateScopedHashEntry(int index, const hentry* source); + + private: + // A struct which encapsulates the new hentry struct introduced in hunspell + // 1.2.8. For a pointer to an hentry struct 'h', hunspell 1.2.8 stores a word + // (including a NUL character) into 'h->word[0]',...,'h->word[h->blen]' even + // though arraysize(h->word[]) is 1. Also, it changed 'astr' to a pointer so + // it can store affix flags into 'h->astr[0]',...,'h->astr[alen-1]'. To handle + // this new hentry struct, we define a struct which combines three values: an + // hentry struct 'hentry'; a char array 'word[kMaxWordLen]', and; an unsigned + // short array 'astr' so a hentry struct 'h' returned from + // CreateScopedHashEntry() satisfies the following equations: + // hentry* h = factory.CreateScopedHashEntry(0, source); + // h->word[0] == ((HashEntryItem*)h)->entry.word[0]. + // h->word[1] == ((HashEntryItem*)h)->word[0]. + // ... + // h->word[h->blen] == ((HashEntryItem*)h)->word[h->blen-1]. + // h->astr[0] == ((HashEntryItem*)h)->astr[0]. + // h->astr[1] == ((HashEntryItem*)h)->astr[1]. + // ... + // h->astr[h->alen-1] == ((HashEntryItem*)h)->astr[h->alen-1]. + enum { + kMaxWordLen = 128, + kMaxAffixLen = 8, + }; + struct HashEntryItem { + hentry entry; + char word[kMaxWordLen]; + unsigned short astr[kMaxAffixLen]; + }; + + HashEntryItem hash_items_[MAX_ROOTS]; +}; + +ScopedHashEntryFactory::ScopedHashEntryFactory() { + memset(&hash_items_[0], 0, sizeof(hash_items_)); +} + +ScopedHashEntryFactory::~ScopedHashEntryFactory() { +} + +hentry* ScopedHashEntryFactory::CreateScopedHashEntry(int index, + const hentry* source) { + if (index >= MAX_ROOTS || source->blen >= kMaxWordLen) + return NULL; + + // Retrieve a HashEntryItem struct from our spool, initialize it, and + // returns the address of its 'hentry' member. + size_t source_size = sizeof(hentry) + source->blen + 1; + HashEntryItem* hash_item = &hash_items_[index]; + memcpy(&hash_item->entry, source, source_size); + if (source->astr) { + hash_item->entry.alen = source->alen; + if (hash_item->entry.alen > kMaxAffixLen) + hash_item->entry.alen = kMaxAffixLen; + memcpy(hash_item->astr, source->astr, hash_item->entry.alen * sizeof(hash_item->astr[0])); + hash_item->entry.astr = &hash_item->astr[0]; + } + return &hash_item->entry; +} + +} // namespace +#endif + + +#ifdef HUNSPELL_CHROME_CLIENT +SuggestMgr::SuggestMgr(hunspell::BDictReader* reader, + const char * tryme, int maxn, + AffixMgr * aptr) +{ + bdict_reader = reader; +#else SuggestMgr::SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr) { +#endif // register affix manager and check in string of chars to // try when building candidate suggestions pAMgr = aptr; @@ -409,6 +514,21 @@ int SuggestMgr::replchars(std::vector& wlst, int wl = strlen(word); if (wl < 2 || !pAMgr) return wlst.size(); + +// TODO: wrong, 'ns' doesn't exist any more +#ifdef HUNSPELL_CHROME_CLIENT + const char *pattern, *pattern2; + hunspell::ReplacementIterator iterator = bdict_reader->GetReplacementIterator(); + while (iterator.GetNext(&pattern, &pattern2)) { + const char* r = word; + size_t lenr = strlen(pattern2); + size_t lenp = strlen(pattern); + + // search every occurence of the pattern in the word + while ((r=strstr(r, pattern)) != NULL) { + candidate = word; + candidate.replace(r-word, lenp, pattern2); +#else const std::vector& reptable = pAMgr->get_reptable(); for (size_t i = 0; i < reptable.size(); ++i) { const char* r = word; @@ -428,6 +548,7 @@ int SuggestMgr::replchars(std::vector& wlst, candidate.resize(r - word); candidate.append(reptable[i].outstrings[type]); candidate.append(r + reptable[i].pattern.size()); +#endif testsug(wlst, candidate, cpdsuggest, NULL, NULL); // check REP suggestions with space size_t sp = candidate.find(' '); @@ -1047,6 +1168,9 @@ void SuggestMgr::ngsuggest(std::vector& wlst, struct hentry* hp = NULL; int col = -1; +#ifdef HUNSPELL_CHROME_CLIENT + ScopedHashEntryFactory hash_entry_factory; +#endif phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; std::string target; std::string candidate; @@ -1074,12 +1198,12 @@ void SuggestMgr::ngsuggest(std::vector& wlst, u8_u16(w_word, word); u8_u16(w_target, target); } - + std::vector w_entry; std::string f; std::vector w_f; std::vector w_target2; - + for (size_t i = 0; i < rHMgr.size(); ++i) { while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { if ((hp->astr) && (pAMgr) && @@ -1143,7 +1267,11 @@ void SuggestMgr::ngsuggest(std::vector& wlst, if (sc > scores[lp]) { scores[lp] = sc; +#ifdef HUNSPELL_CHROME_CLIENT + roots[lp] = hash_entry_factory.CreateScopedHashEntry(lp, hp); +#else roots[lp] = hp; +#endif lval = sc; for (int j = 0; j < MAX_ROOTS; j++) if (scores[j] < lval) { @@ -2058,8 +2186,8 @@ void SuggestMgr::lcs(const char* s, m = strlen(s); n = strlen(s2); } - c = (char*)malloc((m + 1) * (n + 1)); - b = (char*)malloc((m + 1) * (n + 1)); + c = (char *) calloc(m + 1, n + 1); + b = (char *) calloc(m + 1, n + 1); if (!c || !b) { if (c) free(c); @@ -2068,10 +2196,6 @@ void SuggestMgr::lcs(const char* s, *result = NULL; return; } - for (i = 1; i <= m; i++) - c[i * (n + 1)] = 0; - for (j = 0; j <= n; j++) - c[j] = 0; for (i = 1; i <= m; i++) { for (j = 1; j <= n; j++) { if (((utf8) && (su[i - 1] == su2[j - 1])) || diff --git a/src/hunspell/suggestmgr.hxx b/src/hunspell/suggestmgr.hxx index 6ba9dc8..0b80fac 100644 --- a/src/hunspell/suggestmgr.hxx +++ b/src/hunspell/suggestmgr.hxx @@ -124,7 +124,11 @@ class SuggestMgr { int complexprefixes; public: +#ifdef HUNSPELL_CHROME_CLIENT + SuggestMgr(hunspell::BDictReader* reader, const char * tryme, int maxn, AffixMgr *aptr); +#else SuggestMgr(const char* tryme, unsigned int maxn, AffixMgr* aptr); +#endif ~SuggestMgr(); void suggest(std::vector& slst, const char* word, int* onlycmpdsug); @@ -134,6 +138,10 @@ class SuggestMgr { std::string suggest_gen(const std::vector& pl, const std::string& pattern); private: +#ifdef HUNSPELL_CHROME_CLIENT + // Not owned by us, owned by the Hunspell object. + hunspell::BDictReader* bdict_reader; +#endif void testsug(std::vector& wlst, const std::string& candidate, int cpdsuggest,