/* coroutines.c */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

/* Define one of these macros to rewrite the corresponding routine below as a
 * stackless coroutine, or define both to have both of them rewritten. */
#define DECOMPRESS_COROUTINE
// #define TOKENIZE_COROUTINE
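/* Example usage (a sketch; the exact trace output on stderr depends on which
 * of the macros above is defined):
 *
 *	$ cc -o coroutines coroutines.c
 *	$ printf 'ab.\377\003x!' | ./coroutines
 *
 * The byte 0xFF (octal 377) introduces an RLE run: the following byte is the
 * run length and the byte after that is the character to repeat, so the input
 * above decompresses to "ab.xxx!" and should produce the tokens Word "ab",
 * Punct ".", Word "xxx" and Punct "!". */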

/* Helper macro to simplify tracing of function calls and messages. */
#define TRACE(...) do { \
	fprintf(stderr, __VA_ARGS__); \
	putc('\n', stderr); \
} while (0)

/* Helper macro to catch array overflows for extreme inputs. */
#define CATCH_OVERFLOW(ARR, LEN) do { \
	if ((LEN) >= sizeof(ARR)/sizeof(*ARR)) { \
		fprintf(stderr, "PANIC: Array " #ARR " overflow detected, abort\n"); \
		exit(-1); \
	} \
} while (0)

/* Enumeration of possible token tags. */
enum Tag { WORD, PUNCT };

/* Names of the token tags. */
static const char *TOKEN_TAG[] = {
	[WORD] = "Word", [PUNCT] = "Punct"
};

/* Token type with tag and value. */
typedef struct {
	enum Tag tag;
	char val[256];
	size_t len;
} Token;

/* Primitive token channel for buffering multiple detected tokens. */
static struct {
	Token token[256];
	size_t len;
} token_chan;

/* Appends another character to the value of the token currently under
 * construction. */
void add_to_token(char c) {
	CATCH_OVERFLOW(token_chan.token, token_chan.len);
	Token *token = &token_chan.token[token_chan.len];
	/* The +1 keeps room for the terminating NUL written in got_token(). */
	CATCH_OVERFLOW(token->val, token->len + 1);
	token->val[token->len++] = c;
}

/* Closes construction of the current token: terminates its value, attaches
 * the given tag and publishes the token to the channel. */
void got_token(enum Tag tag) {
	CATCH_OVERFLOW(token_chan.token, token_chan.len);
	Token *token = &token_chan.token[token_chan.len];
	token->val[token->len] = '\0';
	token->tag = tag;
	TRACE("got_token(%s) = \"%s\"", TOKEN_TAG[tag], token->val);
	++token_chan.len;
}

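/* How the stackless coroutines below work: each coroutine keeps its resume
 * point in a static "program counter" variable pc. A call enters through
 * switch (pc), which jumps to the case label where the previous call left
 * off; yielding is done by storing the label of the next resume point in pc
 * and returning. Because the case labels sit inside loops, execution resumes
 * in the middle of those loops, much like in Duff's device. The same pattern
 * is often packaged as macros, in the style of Simon Tatham's "Coroutines in
 * C" essay; a rough sketch of such macros (not used in this file) could be:
 *
 *	#define crBegin     static int pc = 0; switch (pc) { case 0:
 *	#define crReturn(x) do { pc = __LINE__; return x; case __LINE__:; } while (0)
 *	#define crFinish    }
 *
 * Only variables that outlive a single call survive a yield, which is why
 * co_decompress() declares pc, l and c static. */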
/* Stackless coroutine version of the decompress routine: each call yields the
 * next decoded character and returns EOF at the end of input. */
int co_decompress(void) {
	static int pc, l, c;
	switch (pc) {
	case 0: while (1) {
		c = getchar();
		if (c == EOF)
			return EOF;
		if (c == 0xFF) {
			l = getchar();
			c = getchar();
			while (l--) {
				TRACE("nextchar() = '%c'", c);
				pc = 1;
				return c;
	case 1:;
			}
		} else {
			TRACE("nextchar() = '%c'", c);
			pc = 2;
			return c;
	case 2:;
		}
	}}
	return EOF; /* Not reached; silences "control reaches end of non-void function" warnings. */
}

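/* In contrast to co_decompress(), which is pulled for characters by its
 * caller, co_tokenize() is a "push"-style coroutine: the caller hands it one
 * character per call and it yields (returns) whenever it needs the next one.
 * Its pc starts at 1 so that the very first character is processed right away
 * instead of being discarded by the yield at the top of the loop. */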
/* Stackless coroutine version of the tokenize routine. */
void co_tokenize(int c) {
	static int pc = 1;
	switch (pc) {
	case 0: while (1) {
		pc = 1;
		return;
	case 1:;
		TRACE("emit('%c')", c);
		if (c == EOF)
			return;
		if (isalpha(c)) {
			do {
				add_to_token(c);
				pc = 2;
				return;
	case 2:;
				TRACE("emit('%c')", c);
			} while (isalpha(c));
			got_token(WORD);
			if (c == EOF) /* Do not emit the EOF marker as punctuation. */
				return;
		}
		add_to_token(c);
		got_token(PUNCT);
	}}
}

/* Plain routine that decodes the RLE-encoded input and pushes each decoded
 * character into the tokenizer coroutine. */
void decompress(void) {
	while (1) {
		int c = getchar();
		if (c == EOF)
			break;
		if (c == 0xFF) {
			int l = getchar();
			c = getchar();
			while (l--) {
				co_tokenize(c);
			}
		} else
			co_tokenize(c);
	}
	co_tokenize(EOF);
}

/* Plain routine that pulls decoded characters from the decompressor coroutine
 * and constructs tokens from them. */
void tokenize(void) {
	while (1) {
		int c = co_decompress();
		if (c == EOF)
			break;
		if (isalpha(c)) {
			do {
				add_to_token(c);
				c = co_decompress();
			} while (isalpha(c));
			got_token(WORD);
			if (c == EOF) /* Do not emit the EOF marker as punctuation. */
				break;
		}
		add_to_token(c);
		got_token(PUNCT);
	}
}

/* Prints all tokens currently present in the token channel and resets it. */
void printToken(void) {
	for (size_t i = 0; i < token_chan.len; ++i) {
		Token *token = &token_chan.token[i];
		TRACE(
			"Token: {\n"
			"\ttag: %s,\n"
			"\tval: \"%s\"\n"
			"}",
			TOKEN_TAG[token->tag],
			token->val
		);
		token->len = 0;
	}
	token_chan.len = 0;
}

/* Program entry. */
int main(void) {
	#if defined(TOKENIZE_COROUTINE) && defined(DECOMPRESS_COROUTINE)
	fprintf(stderr, "Decompress Coroutine, Tokenize Coroutine\n");
	for (int c; (c = co_decompress()) != EOF;) {
		co_tokenize(c);
		printToken();
	}
	co_tokenize(EOF); /* Let the tokenizer flush a trailing word. */
	printToken();
	#elif defined(DECOMPRESS_COROUTINE)
	fprintf(stderr, "Tokenize Routine, Decompress Coroutine\n");
	tokenize();
	printToken();
	#elif defined(TOKENIZE_COROUTINE)
	fprintf(stderr, "Decompress Routine, Tokenize Coroutine\n");
	decompress();
	printToken();
	#else
	#error "At least one of TOKENIZE_COROUTINE or DECOMPRESS_COROUTINE must be defined."
	#endif
	
	return 0;
}