Commit 2d2a6573 authored by Dorian Weber
Decompressor-tokenizer example in C added to the repository.

parent e4e01a0c
#!/usr/bin/make
.SUFFIXES:
.PHONY: all run clean
TAR = decompress-tokenize
SRC = $(wildcard *.c)
OBJ = $(SRC:%.c=%.o)
DEP = $(OBJ:%.o=%.d)
-include $(DEP)
CFLAGS = -std=c11 -Wall -pedantic -MMD -MP
%.o: %.c
	$(CC) $(CFLAGS) $< -c
$(TAR): $(OBJ)
	$(CC) $(CFLAGS) $^ -o $@
all: $(TAR)
run: all
	./$(TAR)
clean:
	$(RM) $(RMFILES) $(TAR) $(OBJ) $(DEP)
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
/* Define either of these macros to enable the coroutine (rewritten) form of
 * the corresponding routine, or define both to have both routines rewritten. */
#define DECOMPRESS_COROUTINE
// #define TOKENIZE_COROUTINE
/* Helper macro to simplify tracing of the function calls and messages. */
#define TRACE(...) do { \
	fprintf(stderr, __VA_ARGS__); \
	putc('\n', stderr); \
} while (0)
/* Helper macro to catch array overflows for extreme inputs. */
#define CATCH_OVERFLOW(ARR, LEN) do { \
	if ((LEN) >= sizeof(ARR)/sizeof(*ARR)) { \
		fprintf(stderr, "PANIC: Array " #ARR " overflow detected, abort\n"); \
		exit(-1); \
	} \
} while (0)
/* Enumeration of possible token tags. */
enum Tag { WORD, PUNCT };
/* Names of the token tags. */
static const char *TOKEN_TAG[] = {
	[WORD] = "WORD", [PUNCT] = "PUNCT"
};
/* Token type with tag and value. */
typedef struct {
	enum Tag tag;
	char val[256];
	size_t len;
} Token;
/* Primitive token channel for buffering multiple detected tokens. */
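/* Completed tokens are appended by got_token() and drained by printToken();
 * the entry at index len is the token currently under construction. */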
static struct {
	Token token[256];
	size_t len;
} token_chan;
/* Function that adds another character to the token value. */
void add_to_token(char c) {
	/* Guard against indexing past the end of the token channel. */
	CATCH_OVERFLOW(token_chan.token, token_chan.len);
	Token *token = &token_chan.token[token_chan.len];
	/* Keep one byte free for the terminating '\0' written in got_token(). */
	CATCH_OVERFLOW(token->val, token->len + 1);
	token->val[token->len++] = c;
}
/* Function that adds the corresponding tag and closes token construction. */
void got_token(enum Tag tag) {
	CATCH_OVERFLOW(token_chan.token, token_chan.len);
	Token *token = &token_chan.token[token_chan.len];
	token->val[token->len] = '\0';
	token->tag = tag;
	TRACE("got_token(%s) = \"%s\"", TOKEN_TAG[tag], token->val);
	++token_chan.len;
}
/* Stackless coroutine version of the decompress routine. */
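/* The coroutine is realized with the classic switch-based trick (Duff's
 * device, as described in Simon Tatham's essay "Coroutines in C"): pc stores
 * the resume point across calls, every return is preceded by an update of pc,
 * and the matching case label drops execution back in right after the point
 * where the previous call left off. Any local state that has to survive a
 * suspension (pc, l, c) therefore lives in static variables. */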
int co_decompress(void) {
	static int pc, l, c;
	switch (pc) {
	case 0: while (1) {
		c = getchar();
		if (c == EOF)
			return EOF;
		if (c == 0xFF) {
			l = getchar();
			c = getchar();
			while (l--) {
				TRACE("nextchar() = '%c'", c);
				pc = 1;
				return c;
	case 1:;
			}
		} else {
			TRACE("nextchar() = '%c'", c);
			pc = 2;
			return c;
	case 2:;
		}
	}}
}
/* Stackless coroutine version of the tokenize routine. */
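/* Unlike co_decompress(), this coroutine is a consumer: it receives one
 * character per call instead of returning one. pc starts at 1, so the first
 * call jumps straight past the purely formal initial yield at the top of the
 * loop. */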
void co_tokenize(int c) {
	static int pc = 1;
	switch (pc) {
	case 0: while (1) {
		pc = 1;
		return;
	case 1:;
		TRACE("emit('%c')", c);
		if (c == EOF)
			return;
		if (isalpha(c)) {
			do {
				add_to_token(c);
				pc = 2;
				return;
	case 2:;
				TRACE("emit('%c')", c);
			} while (isalpha(c));
			got_token(WORD);
		}
		add_to_token(c);
		got_token(PUNCT);
	}}
}
/* Decodes RLE-encoded input and pushes it into the tokenizer coroutine. */
void decompress(void) {
	while (1) {
		int c = getchar();
		if (c == EOF)
			break;
		if (c == 0xFF) {
			int len = getchar();
			c = getchar();
			while (len--) {
				co_tokenize(c);
			}
		} else
			co_tokenize(c);
	}
	co_tokenize(EOF);
}
/* Pulls characters from the decompressor coroutine, which decodes the
 * RLE-encoded input, and constructs tokens from them. */
void tokenize(void) {
	while (1) {
		int c = co_decompress();
		if (c == EOF)
			break;
		if (isalpha(c)) {
			do {
				add_to_token(c);
				c = co_decompress();
			} while (isalpha(c));
			got_token(WORD);
		}
		add_to_token(c);
		got_token(PUNCT);
	}
}
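/* Note the symmetry between the two drivers: decompress() pushes each decoded
 * character into the tokenizer coroutine, while tokenize() pulls characters
 * out of the decompressor coroutine. Either way, only one side of the pipeline
 * has to keep its control state across calls. */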
/* Prints all tokens currently present in the token channel and clears it. */
void printToken(void) {
	for (size_t i = 0; i < token_chan.len; ++i) {
		Token *token = &token_chan.token[i];
		TRACE(
			"Token: {\n"
			"\ttag: %s,\n"
			"\tval: \"%s\"\n"
			"}",
			TOKEN_TAG[token->tag],
			token->val
		);
		token->len = 0;
	}
	token_chan.len = 0;
}
/* Program entry. */
int main(void) {
#if defined(TOKENIZE_COROUTINE) && defined(DECOMPRESS_COROUTINE)
	fprintf(stderr, "Decompress Coroutine, Tokenize Coroutine\n");
	for (int c; (c = co_decompress()) != EOF;) {
		co_tokenize(c);
		printToken();
	}
	/* Flush a token that may still be under construction at end of input. */
	co_tokenize(EOF);
	printToken();
#elif defined(DECOMPRESS_COROUTINE)
	fprintf(stderr, "Tokenize Routine, Decompress Coroutine\n");
	tokenize();
#elif defined(TOKENIZE_COROUTINE)
	fprintf(stderr, "Decompress Routine, Tokenize Coroutine\n");
	decompress();
#else
#error "At least one of TOKENIZE_COROUTINE or DECOMPRESS_COROUTINE must be defined."
#endif
	return 0;
}
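The encoded input format can be read off co_decompress() and decompress(): a 0xFF byte introduces a run and is followed by a length byte and the byte to repeat; every other byte is passed through literally. As a quick way to try the example, here is a minimal sketch of a helper that writes such a sample stream to stdout. The file name make-sample.c and the pipe below are illustrative only and not part of the commit; build the helper outside the project directory, since the Makefile compiles every *.c it finds and would otherwise see two main functions.

/* make-sample.c (hypothetical helper, not part of the commit): emits "ab",
 * a run of five 'z', and a final '.' in the RLE format expected above. */
#include <stdio.h>

int main(void) {
	fputs("ab", stdout);   /* literal bytes are copied through unchanged */
	putchar(0xFF);         /* 0xFF marks a run ...                       */
	putchar(5);            /* ... followed by the run length ...         */
	putchar('z');          /* ... and the byte to repeat                 */
	putchar('.');          /* punctuation terminates the word            */
	return 0;
}

Piped into the example (./make-sample | ./decompress-tokenize), this should produce a WORD token "abzzzzz" followed by a PUNCT token ".".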