From 2d2a6573fece0ba3e2f1b81838d576b2b58f030e Mon Sep 17 00:00:00 2001
From: Dorian Weber <dorian.weber@informatik.hu-berlin.de>
Date: Fri, 11 Jun 2021 18:36:50 +0200
Subject: [PATCH] Decompressor-tokenizer example in C added to the repository.

---
 c/Makefile              |  25 ++++++
 c/decompress-tokenize.c | 187 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 c/Makefile
 create mode 100644 c/decompress-tokenize.c

diff --git a/c/Makefile b/c/Makefile
new file mode 100644
index 0000000..63a6d73
--- /dev/null
+++ b/c/Makefile
@@ -0,0 +1,25 @@
+#!/usr/bin/make
+.SUFFIXES:
+.PHONY: all run clean
+
+TAR = decompress-tokenize
+SRC = $(wildcard *.c)
+OBJ = $(SRC:%.c=%.o)
+DEP = $(OBJ:%.o=%.d)
+-include $(DEP)
+
+CFLAGS = -std=c11 -Wall -pedantic -MMD -MP
+
+%.o: %.c
+	$(CC) $(CFLAGS) $< -c
+
+$(TAR): $(OBJ)
+	$(CC) $(CFLAGS) $^ -o $@
+
+all: $(TAR)
+
+run: all
+	./$(TAR)
+
+clean:
+	$(RM) $(RMFILES) $(TAR) $(OBJ) $(DEP)
diff --git a/c/decompress-tokenize.c b/c/decompress-tokenize.c
new file mode 100644
index 0000000..4fa51a6
--- /dev/null
+++ b/c/decompress-tokenize.c
@@ -0,0 +1,187 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+/* Define DECOMPRESS_COROUTINE and/or TOKENIZE_COROUTINE to replace the plain
+ * routine(s) by their stackless-coroutine rewrites (dispatch is in main()). */
+#define DECOMPRESS_COROUTINE
+// #define TOKENIZE_COROUTINE
+
+/* Helper macro to simplify tracing of the function calls and messages. */
+#define TRACE(...) do { \
+	fprintf(stderr, __VA_ARGS__); \
+	putc('\n', stderr); \
+} while (0)
+
+/* Helper macro to catch array overflows for extreme inputs. */
+#define CATCH_OVERFLOW(ARR, LEN) do { \
+	if ((LEN) >= sizeof(ARR)/sizeof(*ARR)) { \
+		fprintf(stderr, "PANIC: Array " #ARR " overflow detected, abort\n"); \
+		exit(EXIT_FAILURE); /* portable failure status; exit(-1) is not */ \
+	} \
+} while (0)
+
+/* Enumeration of possible token tags. */
+enum Tag { WORD, PUNCT };
+
+/* Names of the token tags, indexed by enum Tag (used in trace output). 
*/
+static const char *TOKEN_TAG[] = {
+	[WORD] = "WORD", [PUNCT] = "PUNCT"
+};
+
+/* Token type with tag, value string and current value length. */
+typedef struct {
+	enum Tag tag;
+	char val[256];
+	size_t len;
+} Token;
+
+/* Primitive token channel for buffering multiple detected tokens. */
+static struct {
+	Token token[256]; /* token[len] is the one currently under construction */
+	size_t len;
+} token_chan;
+
+/* Appends one character to the value of the token under construction. */
+void add_to_token(char c) {
+	Token *token = &token_chan.token[token_chan.len];
+	CATCH_OVERFLOW(token->val, token->len);
+	token->val[token->len++] = c;
+}
+
+/* Finalizes the token under construction: NUL-terminates its value, stores the tag and publishes it into token_chan. */
+void got_token(enum Tag tag) {
+	CATCH_OVERFLOW(token_chan.token, token_chan.len);
+	Token *token = &token_chan.token[token_chan.len];
+	token->val[token->len] = '\0';
+	token->tag = tag;
+	TRACE("got_token(%s) = \"%s\"", TOKEN_TAG[tag], token->val);
+	++token_chan.len;
+}
+
+/* Stackless coroutine version of the decompressor: each call yields the next decoded character (EOF at end of input); `pc` records the resume point across calls via the surrounding switch (Duff's-device style). */
+int co_decompress(void) {
+	static int pc, l, c; /* persist across calls: resume point, run length, current char */
+	switch (pc) {
+	case 0: while (1) {
+		c = getchar();
+		if (c == EOF)
+			return EOF;
+		if (c == 0xFF) { /* 0xFF introduces an RLE run: <0xFF> <length> <char> */
+			l = getchar();
+			c = getchar();
+			while (l--) {
+				TRACE("nextchar() = '%c'", c);
+				pc = 1;
+				return c; /* yield one character of the run... */
+	case 1:;               /* ...and resume here, inside the run loop */
+			}
+		} else {
+			TRACE("nextchar() = '%c'", c);
+			pc = 2;
+			return c; /* yield a literal character... */
+	case 2:;               /* ...and resume here, back to the top of the loop */
+		}
+	}}
+}
+
+/* Stackless coroutine version of the tokenizer: each call pushes one character `c` (or EOF) into the token state machine; `pc` starts at 1 so the very first call consumes its argument immediately (case 0 is the generic yield point). */
+void co_tokenize(int c) {
+	static int pc = 1;
+	switch (pc) {
+	case 0: while (1) {
+		pc = 1;
+		return; /* yield: wait for the next character */
+	case 1:;
+		TRACE("emit('%c')", c);
+		if (c == EOF)
+			return;
+		if (isalpha(c)) {
+			do {
+				add_to_token(c);
+				pc = 2;
+				return; /* yield from inside the word loop... */
+	case 2:;               /* ...and resume here with the next character */
+				TRACE("emit('%c')", c);
+			} while (isalpha(c));
+			got_token(WORD);
+		}
+		add_to_token(c); /* word terminator (or lone char) becomes PUNCT; NOTE(review): if input ends right after a word, (char)EOF is recorded here */
+		got_token(PUNCT);
+	}}
+}
+
+/* Decodes RLE-encoded input and pushes it into the tokenizer coroutine. 
*/ +void decompress(void) { + while (1) { + int c = getchar(); + if (c == EOF) + break; + if (c == 0xFF) { + int len = getchar(); + c = getchar(); + while (len--) { + co_tokenize(c); + } + } else + co_tokenize(c); + } + co_tokenize(EOF); +} + +/* Calls the decompressor-coroutine for decoding RLE-encoded input and + * constructs token. */ +void tokenize(void) { + while (1) { + int c = co_decompress(); + if (c == EOF) + break; + if (isalpha(c)) { + do { + add_to_token(c); + c = co_decompress(); + } while (isalpha(c)); + got_token(WORD); + } + add_to_token(c); + got_token(PUNCT); + } +} + +/* Prints all token currently present in the token channel. */ +void printToken(void) { + for (size_t i = 0; i < token_chan.len; ++i) { + Token *token = &token_chan.token[i]; + TRACE( + "Token: {\n" + "\ttag: %s,\n" + "\tval: \"%s\"\n" + "}", + TOKEN_TAG[token->tag], + token->val + ); + token->len = 0; + } + token_chan.len = 0; +} + +/* Program entry. */ +int main() { + #if defined(TOKENIZE_COROUTINE) && defined(DECOMPRESS_COROUTINE) + fprintf(stderr, "Decompress Coroutine, Tokenize Coroutine\n"); + for (int c; (c = co_decompress()) != EOF;) { + co_tokenize(c); + printToken(); + } + #elif defined(TOKENIZE_COROUTINE) + fprintf(stderr, "Tokenize Routine, Decompress Coroutine\n"); + tokenize(); + #elif defined(DECOMPRESS_COROUTINE) + fprintf(stderr, "Decompress Routine, Tokenize Coroutine\n"); + decompress(); + #else + #error "At least one (or both) of TOKENIZE_COROUTINE or DECOMPRESS_COROUTINE should be defined." + #endif + + return 0; +} -- GitLab