291 lines
6.5 KiB
C
291 lines
6.5 KiB
C
/* Tokenize words, letter by letter.
 * Supports Latin characters.
 * Compile with gcc.
 *
 * (c) 2021 Sakuragasaki46
 */
|
||
|
||
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <errno.h>
|
||
#include <locale.h>
|
||
#include <wctype.h>
|
||
#include <ctype.h>
|
||
|
||
|
||
// user interface

/* printf() format of the --help text; both %s slots take the program name. */
const char HELP[] =
    "%s - words in a text file\n"
    "Prints a list of words in lowercase form, one per line.\n"
    "Supports UTF-8 strings.\n"
    "\n"
    "Usage: %s [filename]\n"
    "\n"
    "Arguments:\n"
    "\tfilename\t\tfilename to analyze (defaults to stdin)\n"
    "\t-s\t\tcase sensitive\n"
    "\t-n\t\tdon’t normalize non-ASCII characters\n"
    "\t--help\t\tshows this help message and exits\n"
    "\t--version\t\tshow version and exit\n";

/* Text printed by --version. */
const char VERSION[] = "0.2";
|
||
|
||
// behavior switches

/* Set to 1 by the -s option; when non-zero, letters are not lowercased. */
int cASE_SENSITIVE = 0;

/* Set to 1 by the -n option; when non-zero, accented Latin letters are
 * re-encoded as UTF-8 instead of being replaced by ASCII letters. */
int nO_NORMALIZE = 0;
|
||
|
||
// Table data for normCharcode().
//
// Each table pair is indexed by (code point - first code point of the range).
// The primary table holds the ASCII replacement letter; a space means the
// character has no replacement and is dropped.  An uppercase letter marks a
// two-letter replacement: its lowercase form is emitted first, followed by
// the character found at the same index in the companion "_B" table.
// Both tables of a pair MUST therefore have exactly the same length.

// Latin-1 Supplement, U+00C0..U+00FF (64 entries, shown 16 per row)
const char ASCII_DATA_192[] =
    "aaaaaaAceeeeiiii"
    "Dnooooo OuuuuyTS"
    "aaaaaaAceeeeiiii"
    "Dnooooo OuuuuyTy";
// Second letters for the uppercase markers above:
// AE -> "ae", ETH -> "dh", O-slash -> "oe", THORN -> "th", sharp s -> "ss".
const char ASCII_DATA_192_B[] =
    "      e         "
    "h       e     hs"
    "      e         "
    "h       e     h ";
|
||
|
||
// Latin Extended-A, U+0100..U+017F (128 entries, shown 16 per row).
// Indexed by (code point - 0x100).  In the primary table a lowercase letter
// is the single-letter ASCII replacement; an uppercase letter marks a
// two-letter replacement whose second letter sits at the same index in
// ASCII_DATA_U0100_B.  Both tables must have the same length.
const char ASCII_DATA_U0100[] =
    "aaaaaaccCCccCCdd"
    "DDeeeeeeeeYYgggg"
    "gggghhhhiiiiiiii"
    "iiIIjjkkklllllll"
    "lllnnnnnnnNNoooo"
    "ooOOrrrrrrssSSSS"
    "SSTTttttuuuuuuOO"
    "uuuuwwyyyZZzzZZs";
const char ASCII_DATA_U0100_B[] =
    "        hh  hh  "
    "jj        ee    "
    "                "
    "  jj            "
    "          gg    "
    "  ee        hhhh"
    "hhss          uu"
    "         hh  hh ";
|
||
|
||
|
||
/* Growable byte buffer holding the word currently being assembled.
 * Despite the "linked list" name, no link field is present: the type is
 * a single heap buffer plus its bookkeeping. */
typedef struct string_linkedlist_s {
    char  *s;        /* heap buffer with the bytes of the word */
    size_t len;      /* number of bytes of s currently in use */
    size_t bufsize;  /* allocated size of s, in bytes */
} StringLL;
|
||
|
||
int main_file(const char* filename);
|
||
int main_stdin(void);
|
||
int tok_words(FILE *fr);
|
||
int readCharcode(FILE* fh);
|
||
int readWagonChar(FILE *fh);
|
||
int normCharcode(char *s, size_t pos, size_t offset, const char *table1, const char *table2);
|
||
|
||
StringLL* StringLL_new();
|
||
StringLL* StringLL_grow(StringLL*);
|
||
StringLL* StringLL_appendchar(StringLL*, int);
|
||
StringLL* StringLL_next_print(StringLL*);
|
||
void StringLL_destroy(StringLL*);
|
||
|
||
int main(int argc, char *argv[]) {
|
||
int curarg = 1;
|
||
|
||
// set locale, otherwise towlower() does not work
|
||
setlocale(LC_ALL, "en_US.UTF-8");
|
||
|
||
if (argc == 1) {
|
||
return main_stdin();
|
||
}
|
||
while (curarg < argc){
|
||
if (!strcmp(argv[1], "--help")) {
|
||
printf(HELP, argv[0], argv[0]);
|
||
exit(0);
|
||
} else if (!strcmp(argv[curarg], "--version")) {
|
||
puts(VERSION);
|
||
exit(0);
|
||
} else if (!strcmp(argv[curarg], "-s")) {
|
||
cASE_SENSITIVE = 1;
|
||
} else if (!strcmp(argv[curarg], "-n")) {
|
||
nO_NORMALIZE = 1;
|
||
} else if (!strcmp(argv[curarg], "-") || argv[curarg][0] == '\0') {
|
||
return main_stdin();
|
||
} else if (strncmp(argv[curarg], "-", 1)) {
|
||
return main_file(argv[curarg]);
|
||
} else {
|
||
fprintf(stderr, "Unknown option: \"%s\"\n", argv[curarg]);
|
||
return 1;
|
||
}
|
||
}
|
||
|
||
return 0;
|
||
};
|
||
|
||
/**
 * Tokenizes the words of a named file.
 *
 * Terminates the process with exit status 1 (after a message on stderr)
 * when the file cannot be opened.
 *
 * @param filename path of the file to read
 * @return the value returned by tok_words()
 */
int main_file(const char* filename){
    FILE *fh = fopen(filename, "r");

    if (!fh) {
        fprintf(stderr, "[Errno %d] Could not open file \"%s\"\n", errno, filename);
        exit(1);
    }

    int status = tok_words(fh);

    // The stream was opened here, so it is released here.
    fclose(fh);
    return status;
}
|
||
|
||
/**
 * Tokenizes the words arriving on standard input.
 *
 * @return the value returned by tok_words()
 */
int main_stdin(void){
    FILE *input = stdin;

    return tok_words(input);
}
|
||
|
||
/**
|
||
* Main activity.
|
||
*
|
||
* @param fh a read-only file handle
|
||
*/
|
||
int tok_words(FILE *fh){
|
||
int charcode = 0, intoword = 0;
|
||
StringLL *lend;
|
||
|
||
lend = StringLL_new();
|
||
|
||
while ((charcode = readCharcode(fh)) >= 0) {
|
||
if (iswalpha(charcode)){
|
||
intoword++;
|
||
|
||
// locale lower case
|
||
if (!cASE_SENSITIVE) {
|
||
charcode = towlower(charcode);
|
||
}
|
||
|
||
lend = StringLL_appendchar(lend, charcode);
|
||
} else if (intoword > 0) {
|
||
intoword = 0;
|
||
|
||
lend = StringLL_next_print(lend);
|
||
}
|
||
}
|
||
|
||
if (intoword){
|
||
lend = StringLL_next_print(lend);
|
||
}
|
||
|
||
StringLL_destroy(lend);
|
||
return 0;
|
||
}
|
||
|
||
/**
 * Read one UTF-8 encoded character and return its code point.
 *
 * One-, two- and three-byte sequences are decoded.  Anything else yields 0:
 * a stray continuation byte, a lead byte of a longer sequence, a sequence
 * cut short, an overlong encoding (a code point stored in more bytes than
 * it needs) or a UTF-16 surrogate (U+D800..U+DFFF).
 *
 * @param fh stream to read from
 * @return the code point (> 0), 0 for a NUL byte or malformed/unsupported
 *         input, -1 on EOF
 */
int readCharcode(FILE* fh){
    int lead = fgetc(fh);

    if (lead < 0) return -1;
    if (lead < 128) return lead;

    int extra;      /* continuation bytes that must follow the lead byte */
    int cp;         /* code point being assembled */
    int smallest;   /* smallest code point that legitimately needs this length */

    if (192 <= lead && lead < 224) {
        extra = 1;
        cp = lead - 192;
        smallest = 0x80;
    } else if (224 <= lead && lead < 240) {
        extra = 2;
        cp = lead - 224;
        smallest = 0x800;
    } else {
        return 0;
    }

    /* every continuation byte contributes six more bits */
    while (extra-- > 0) {
        int tail = readWagonChar(fh);
        if (tail < 0) return 0;
        cp = cp * 64 + tail;
    }

    if (cp < smallest) return 0;                  /* overlong encoding */
    if (0xD800 <= cp && cp <= 0xDFFF) return 0;   /* surrogate, not a character */
    return cp;
}
|
||
|
||
/**
 * Read one UTF-8 continuation ("wagon") byte, i.e. a byte in 0x80..0xBF.
 *
 * When the next byte is not a continuation byte it is pushed back onto the
 * stream, so that the character it starts is not lost to the caller.
 *
 * @param fh stream to read from
 * @return the six payload bits (0..63), or -1 on EOF or when the next byte
 *         is not a continuation byte
 */
int readWagonChar(FILE * fh){
    int c = fgetc(fh);

    if (c < 128 || c >= 192) {
        if (c != EOF) {
            ungetc(c, fh);
        }
        return -1;
    }
    return c - 128;
}
|
||
|
||
/**
 * Write the ASCII replacement of one accented letter into a buffer.
 *
 * table1[offset] selects the replacement: a space means "no replacement"
 * (nothing is written), a lowercase letter is written as is, and an
 * uppercase letter is written in lowercase followed by table2[offset].
 * The caller must make sure s has room for two bytes starting at pos.
 *
 * @param s      destination buffer
 * @param pos    index in s of the first byte to write
 * @param offset index of the character in both tables
 * @param table1 primary replacement table
 * @param table2 second letters for the two-letter replacements
 * @return number of bytes written to s: 0, 1 or 2
 */
int normCharcode(char * s, size_t pos, size_t offset, const char *table1, const char *table2){
    /* <ctype.h> functions need a value representable as unsigned char */
    const unsigned char primary = (unsigned char) table1[offset];

    if (primary == ' ') {
        return 0;
    }

    if (!isupper(primary)) {
        s[pos] = (char) primary;
        return 1;
    }

    s[pos] = (char) tolower(primary);
    s[pos + 1] = table2[offset];
    return 2;
}
|
||
|
||
/***** StringLL functions *******/
|
||
|
||
|
||
StringLL* StringLL_new () {
|
||
StringLL* l;
|
||
|
||
l = (StringLL*) malloc (sizeof(StringLL));
|
||
l->bufsize = 16;
|
||
l->s = (char *) malloc(l->bufsize);
|
||
l->len = 0;
|
||
|
||
return l;
|
||
}
|
||
|
||
/**
 * Double the capacity of a word buffer, keeping its contents.
 *
 * Running out of memory (or a capacity that can no longer be doubled) is
 * treated as fatal: a message is written to stderr and the process exits
 * with status 1.
 *
 * @param l buffer to enlarge
 * @return l
 */
StringLL* StringLL_grow (StringLL* l){
    char *bigger = NULL;
    size_t newsize = 0;

    /* guard the doubling against size_t wrap-around */
    if (l->bufsize <= (size_t) -1 / 2) {
        newsize = l->bufsize * 2;
        /* keep l->s untouched until realloc is known to have succeeded */
        bigger = realloc(l->s, newsize);
    }

    if (bigger == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }

    l->s = bigger;
    l->bufsize = newsize;
    return l;
}
|
||
|
||
/**
 * Append one code point to a word buffer.
 *
 * ASCII is stored as is.  Unless nO_NORMALIZE is set, Latin-1 Supplement
 * (U+00C0..U+00FF) and Latin Extended-A (U+0100..U+017F) letters are
 * replaced by the ASCII letters given by the normalization tables (which
 * may be zero, one or two bytes).  Every other code point is stored as its
 * UTF-8 encoding.  Code points <= 0 or above U+10FFFF are ignored.
 *
 * At most 4 bytes are written, and one spare byte is always left after
 * them for the string terminator.
 *
 * @param l buffer to append to
 * @param c code point to append
 * @return the buffer to keep using
 */
StringLL* StringLL_appendchar(StringLL* l, int c){
    if (c <= 0 || c > 0x10FFFF) {
        return l;
    }

    /* keep room for the longest encoding plus the terminating NUL */
    if (l->bufsize - l->len <= 4){
        l = StringLL_grow(l);
    }

    /* taken after the possible grow: the buffer may have moved */
    char *out = l->s + l->len;

    if (c < 128){
        // ascii
        out[0] = (char) c;
        l->len += 1;
    } else if (!nO_NORMALIZE && 192 <= c && c < 256) {
        // latin-1 supplement
        l->len += normCharcode(l->s, l->len, c - 192, ASCII_DATA_192, ASCII_DATA_192_B);
    } else if (!nO_NORMALIZE && 256 <= c && c < 384) {
        // latin extended-A
        l->len += normCharcode(l->s, l->len, c - 256, ASCII_DATA_U0100, ASCII_DATA_U0100_B);
    } else if (c < 0x800) {
        // 2 byte UTF-8: 110xxxxx 10xxxxxx
        out[0] = (char) (0xC0 | (c >> 6));
        out[1] = (char) (0x80 | (c & 0x3F));
        l->len += 2;
    } else if (c < 0x10000) {
        // 3 byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = (char) (0xE0 | (c >> 12));
        out[1] = (char) (0x80 | ((c >> 6) & 0x3F));
        out[2] = (char) (0x80 | (c & 0x3F));
        l->len += 3;
    } else {
        // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out[0] = (char) (0xF0 | (c >> 18));
        out[1] = (char) (0x80 | ((c >> 12) & 0x3F));
        out[2] = (char) (0x80 | ((c >> 6) & 0x3F));
        out[3] = (char) (0x80 | (c & 0x3F));
        l->len += 4;
    }
    return l;
}
|
||
|
||
/**
 * Print the word held in a buffer, followed by a newline, and hand back
 * an empty buffer for the next word.
 *
 * The same buffer is emptied and returned, so its memory is reused instead
 * of being freed and allocated again for every word.
 *
 * @param l buffer holding the finished word; must have at least one unused
 *          byte after its contents for the terminator
 * @return the (now empty) buffer to use for the next word
 */
StringLL* StringLL_next_print (StringLL *l){
    l->s[l->len] = '\0';
    puts(l->s);

    l->len = 0;
    return l;
}
|
||
|
||
/**
 * Release a word buffer and the bytes it owns.
 *
 * @param l buffer to release; NULL is accepted and ignored, like free()
 */
void StringLL_destroy (StringLL *l){
    if (l == NULL) {
        return;
    }
    free(l->s);
    free(l);
}
|
||
|