From 2d2a6573fece0ba3e2f1b81838d576b2b58f030e Mon Sep 17 00:00:00 2001
From: Dorian Weber <dorian.weber@informatik.hu-berlin.de>
Date: Fri, 11 Jun 2021 18:36:50 +0200
Subject: [PATCH] Decompressor-tokenizer example in C added to the repository.

---
 c/Makefile              |  25 ++++++
 c/decompress-tokenize.c | 187 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 c/Makefile
 create mode 100644 c/decompress-tokenize.c

diff --git a/c/Makefile b/c/Makefile
new file mode 100644
index 0000000..63a6d73
--- /dev/null
+++ b/c/Makefile
@@ -0,0 +1,25 @@
+#!/usr/bin/make
+.SUFFIXES:
+.PHONY: all run clean
+
+TAR = decompress-tokenize
+SRC = $(wildcard *.c)
+OBJ = $(SRC:%.c=%.o)
+DEP = $(OBJ:%.o=%.d)
+-include $(DEP)
+
+CFLAGS = -std=c11 -Wall -pedantic -MMD -MP
+
+%.o: %.c
+	$(CC) $(CFLAGS) $< -c
+
+$(TAR): $(OBJ)
+	$(CC) $(CFLAGS) $^ -o $@
+
+all: $(TAR)
+
+run: all
+	./$(TAR)
+
+clean:
+	$(RM) $(RMFILES) $(TAR) $(OBJ) $(DEP)
diff --git a/c/decompress-tokenize.c b/c/decompress-tokenize.c
new file mode 100644
index 0000000..4fa51a6
--- /dev/null
+++ b/c/decompress-tokenize.c
@@ -0,0 +1,187 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+/* Define DECOMPRESS_COROUTINE and/or TOKENIZE_COROUTINE to replace the plain
+ * routine(s) by their stackless-coroutine rewrites (dispatch is in main()). */
+#define DECOMPRESS_COROUTINE
+// #define TOKENIZE_COROUTINE
+
+/* Helper macro to simplify tracing of the function calls and messages. */
+#define TRACE(...) do { \
+	fprintf(stderr, __VA_ARGS__); \
+	putc('\n', stderr); \
+} while (0)
+
+/* Helper macro to catch array overflows for extreme inputs. */
+#define CATCH_OVERFLOW(ARR, LEN) do { \
+	if ((LEN) >= sizeof(ARR)/sizeof(*ARR)) { \
+		fprintf(stderr, "PANIC: Array " #ARR " overflow detected, abort\n"); \
+		exit(EXIT_FAILURE); /* portable failure status; exit(-1) is not */ \
+	} \
+} while (0)
+
+/* Enumeration of possible token tags. */
+enum Tag { WORD, PUNCT };
+
+/* Names of the token tags, indexed by enum Tag (used in trace output). 
*/
+static const char *TOKEN_TAG[] = {
+	[WORD] = "WORD", [PUNCT] = "PUNCT"
+};
+
+/* Token type with tag, value string and current value length. */
+typedef struct {
+	enum Tag tag;
+	char val[256];
+	size_t len;
+} Token;
+
+/* Primitive token channel for buffering multiple detected tokens. */
+static struct {
+	Token token[256]; /* token[len] is the one currently under construction */
+	size_t len;
+} token_chan;
+
+/* Appends one character to the value of the token under construction. */
+void add_to_token(char c) {
+	Token *token = &token_chan.token[token_chan.len];
+	CATCH_OVERFLOW(token->val, token->len);
+	token->val[token->len++] = c;
+}
+
+/* Finalizes the token under construction: NUL-terminates its value, stores the tag and publishes it into token_chan. */
+void got_token(enum Tag tag) {
+	CATCH_OVERFLOW(token_chan.token, token_chan.len);
+	Token *token = &token_chan.token[token_chan.len];
+	token->val[token->len] = '\0';
+	token->tag = tag;
+	TRACE("got_token(%s) = \"%s\"", TOKEN_TAG[tag], token->val);
+	++token_chan.len;
+}
+
+/* Stackless coroutine version of the decompressor: each call yields the next decoded character (EOF at end of input); `pc` records the resume point across calls via the surrounding switch (Duff's-device style). */
+int co_decompress(void) {
+	static int pc, l, c; /* persist across calls: resume point, run length, current char */
+	switch (pc) {
+	case 0: while (1) {
+		c = getchar();
+		if (c == EOF)
+			return EOF;
+		if (c == 0xFF) { /* 0xFF introduces an RLE run: <0xFF> <length> <char> */
+			l = getchar();
+			c = getchar();
+			while (l--) {
+				TRACE("nextchar() = '%c'", c);
+				pc = 1;
+				return c; /* yield one character of the run... */
+	case 1:;               /* ...and resume here, inside the run loop */
+			}
+		} else {
+			TRACE("nextchar() = '%c'", c);
+			pc = 2;
+			return c; /* yield a literal character... */
+	case 2:;               /* ...and resume here, back to the top of the loop */
+		}
+	}}
+}
+
+/* Stackless coroutine version of the tokenizer: each call pushes one character `c` (or EOF) into the token state machine; `pc` starts at 1 so the very first call consumes its argument immediately (case 0 is the generic yield point). */
+void co_tokenize(int c) {
+	static int pc = 1;
+	switch (pc) {
+	case 0: while (1) {
+		pc = 1;
+		return; /* yield: wait for the next character */
+	case 1:;
+		TRACE("emit('%c')", c);
+		if (c == EOF)
+			return;
+		if (isalpha(c)) {
+			do {
+				add_to_token(c);
+				pc = 2;
+				return; /* yield from inside the word loop... */
+	case 2:;               /* ...and resume here with the next character */
+				TRACE("emit('%c')", c);
+			} while (isalpha(c));
+			got_token(WORD);
+		}
+		add_to_token(c); /* word terminator (or lone char) becomes PUNCT; NOTE(review): if input ends right after a word, (char)EOF is recorded here */
+		got_token(PUNCT);
+	}}
+}
+
+/* Decodes RLE-encoded input and pushes it into the tokenizer coroutine. 
*/ +void decompress(void) { + while (1) { + int c = getchar(); + if (c == EOF) + break; + if (c == 0xFF) { + int len = getchar(); + c = getchar(); + while (len--) { + co_tokenize(c); + } + } else + co_tokenize(c); + } + co_tokenize(EOF); +} + +/* Calls the decompressor-coroutine for decoding RLE-encoded input and + * constructs token. */ +void tokenize(void) { + while (1) { + int c = co_decompress(); + if (c == EOF) + break; + if (isalpha(c)) { + do { + add_to_token(c); + c = co_decompress(); + } while (isalpha(c)); + got_token(WORD); + } + add_to_token(c); + got_token(PUNCT); + } +} + +/* Prints all token currently present in the token channel. */ +void printToken(void) { + for (size_t i = 0; i < token_chan.len; ++i) { + Token *token = &token_chan.token[i]; + TRACE( + "Token: {\n" + "\ttag: %s,\n" + "\tval: \"%s\"\n" + "}", + TOKEN_TAG[token->tag], + token->val + ); + token->len = 0; + } + token_chan.len = 0; +} + +/* Program entry. */ +int main() { + #if defined(TOKENIZE_COROUTINE) && defined(DECOMPRESS_COROUTINE) + fprintf(stderr, "Decompress Coroutine, Tokenize Coroutine\n"); + for (int c; (c = co_decompress()) != EOF;) { + co_tokenize(c); + printToken(); + } + #elif defined(TOKENIZE_COROUTINE) + fprintf(stderr, "Tokenize Routine, Decompress Coroutine\n"); + tokenize(); + #elif defined(DECOMPRESS_COROUTINE) + fprintf(stderr, "Decompress Routine, Tokenize Coroutine\n"); + decompress(); + #else + #error "At least one (or both) of TOKENIZE_COROUTINE or DECOMPRESS_COROUTINE should be defined." + #endif + + return 0; +} -- GitLab