diff options
Diffstat (limited to 'transtbl.cc')
-rw-r--r-- | transtbl.cc | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/transtbl.cc b/transtbl.cc new file mode 100644 index 0000000..6c4b3d1 --- /dev/null +++ b/transtbl.cc @@ -0,0 +1,136 @@ +// Copyright (C) 2003 Mooffie <mooffie@typo.co.il> +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + +#include <config.h> + +#include <stdio.h> +#include <errno.h> + +#include "transtbl.h" +#include "io.h" // set_last_error +#include "dbg.h" + +// Most of the code below deals with parsing a TranslationTable +// file. Such files consist of lines of the form: +// +// <character-from> <character-to> +// +// that map character-from to character-to. +// +// <character-xxx> can be in one of three forms: +// +// 1. ' literal-character ' +// 2. decimal-number . +// 3. hex-number +// +// Examples: +// +// 'a' 5d0 # maps 'a' to Hebrew letter Alef +// 'a' 1488. # the same +// 'a' 'b' # maps 'a' to 'b' +// +// literal-character is UTF-8 encoded. + + +// parse_next_char() - parses the next <character> token. (this is a +// misnomer, because one might think we mean C's "char".) +// +// If there was no lexical error, returns a pointer to the end of the +// token (so one can continue to parse the next token); else returns +// NULL. + +static char *parse_next_char(char *s, unichar &ch) +{ + while (*s == ' ' || *s == '\t') + s++; + if (!*s) + return NULL; + if (*s == '\'') { + s++; + char *end = strchr(s + 1, '\''); + if (!end) + return NULL; + unistring us; + us.init_from_utf8(s, end - s); + if (us.size() != 1) + return false; + ch = us[0]; + return end + 1; + } else { + char *end; + errno = 0; + int val = strtol(s, &end, 16); + if (*end == '.') { + *end = ' '; + val = strtol(s, &end, 10); + } + if (errno || (*end != '\0' && *end != ' ' && *end != '\t')) + return NULL; + ch = (unichar)val; + return end; + } +} + +// load(filename) - loads--that is, parse--a file. It reads the file line by +// line and for each line calls parse_next_char() to parse the two +// <character> tokens. It then adds the mapping to the map table. + +bool TranslationTable::load(const char *filename) +{ +#define MAX_LINE_LEN 1024 + charmap.clear(); + + FILE *fp = fopen(filename, "r"); + if (!fp) { + set_last_error(errno); + return false; + } + DBG(1, ("Reading translation table %s\n", filename)); + + char line[MAX_LINE_LEN]; + while (fgets(line, MAX_LINE_LEN, fp)) { + int len = strlen(line); + if (len && line[len-1] == '\n') + line[len-1] = 0; + if (strchr(line, '#')) // remove comment + *(strchr(line, '#')) = '\0'; + + unichar ch1, ch2; + char *s = line; + if ((s = parse_next_char(s, ch1))) + if ((s = parse_next_char(s, ch2))) + charmap[ch1] = ch2; + } + fclose(fp); + + return true; +#undef MAX_LINE_LEN +} + +// translate_char() - matches a character with another, in-place. returns +// false if no match exists. + +bool TranslationTable::translate_char(unichar &ch) const +{ + std::map<unichar, unichar>::const_iterator + it = charmap.find(ch); + if (it != charmap.end()) { + ch = it->second; + return true; + } else + return false; +} + |