Уведен лексер, још увек нестабилан и минималан

2019-01-06 12:30:07 +01:00 · 2019-01-06 12:30:07 +01:00 · 8d89aa622a
commit 8d89aa622a
parent 7cff23f9ca
5 changed files with 136 additions and 17 deletions
--- a/4
+++ b/4
@ -13,7 +13,7 @@ LDFLAGS = -lm -lc

 CC = cc

-SRC = cirilisp.c util.c
+SRC = cirilisp.c util.c lexer.c
 OBJ = $(SRC:.c=.o)

 all: cirilisp
@ -21,7 +21,7 @@ all: cirilisp
 .c.o:
 	$(CC) -c $(CFLAGS) $<

-$(OBJ): util.h
+$(OBJ): util.h lexer.h

 cirilisp: $(OBJ)
 	$(CC) -o $@ $(OBJ) $(LDFLAGS)
--- a/cirilisp.c
+++ b/cirilisp.c
@ -3,32 +3,38 @@
 #include <stdlib.h>

 #include "util.h"
+#include "lexer.h"

 int main(int argc, char **argv)
 {
-	// Омогућава библиотекама коришћеним у интерпретеру да протумаче
-	// српску ћирилицу
-	// struct lconv *loc;
-	setlocale(LC_ALL, "sr_RS.utf8");
-	// loc = localeconv();
-
-	// Бојлерплејт обавештења о окружењу
-	puts("Прост РЕПЛ:");
-	puts("Притисните ctrl+c да бисте изашли\n");
-
-	while (1)
+/* Омогућава библиотекама коришћеним у интерпретеру да протумаче српску ћирилицу */
+	if (setlocale(LC_ALL, "sr_RS.utf8") == NULL)
 	{
-		char *input = readline("k> ");
-		if (input == NULL)
-		{
-			putchar('\n');
+		fprintf(stderr, "locale couldn't be set to \"sr_RS.utf8\", check if you've enabled it on your system\n");
 		exit(0);
 	}

-		printf("НАПИСАЛИ СТЕ: %s\n", input);
+	while (1)
+	{
+		char *input = readline("Л> ");
+		if (input == NULL)
+		{
+			putchar('\n');
+			printf("Крај улазног тока.\n");
+			// Превод
+			exit(0);
+		}
+
+		token *tokenList, *current;
+		current = tokenList = lexLine(input);
+		int i = 0;
+		while (current != NULL)
+		{
+			printf("Токен бр. %d: \"%s\", тип:%s\n", i,
+current->lexeme, current->type == number ? "number" : (current->type == symbol 
+? "symbol" : "parenthesis"));
+		}

 		free(input);
 	}
-
-	return 0;
 }
--- a/lexer.c
+++ b/lexer.c
@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <regex.h>
+
+#include "lexer.h"
+
+token *lex1token(char *input, int *i);
+/* returns a pointer to a single token structure describing one particular
+token; reads the input string starting at the i-th character and, for a
+recognised token, updates i so that it indexes the next token or string end */
+
+/* Splits one input line into tokens.
+
+input is the NUL-terminated line to lex; this function only reads it.
+
+Returns the head of a newly allocated singly linked list holding the tokens in
+input order; the caller frees every node and its lexeme. Returns NULL when the
+line holds no tokens, and also when a token is invalid or a node cannot be
+allocated; in that case a message is printed to stderr and the partially built
+list is freed. */
+token *lexLine(char *input)
+{
+	int n = strlen(input);
+/* skip leading blanks so that an empty or blank line yields an empty list
+instead of an "invalid token" report */
+	int i = strspn(input, " \t\n\v\f\r");
+	token *root = NULL, **new = &root;
+
+	while (i < n)
+	{
+		*new = lex1token(input, &i);
+/* test the node that was just stored; its next link is still NULL and must
+not be dereferenced */
+		if (*new == NULL || (*new)->type == undefined)
+		{
+/* if the line contains a lexically invalid token, its position in the line is
+printed, the whole list is deleted and the function returns NULL */
+			fprintf(stderr, "Невалидан токен на месту %d\n", i);
+			while (root != NULL)
+			{
+/* remember the successor before the node is freed */
+				token *next = root->next;
+/* an undefined token carries no lexeme to release */
+				if (root->type != undefined)
+				{
+					free(root->lexeme);
+				}
+				free(root);
+				root = next;
+			}
+			return NULL;
+		}
+		new = &((*new)->next);
+	}
+	return root;
+}
+
+regex_t regNumber, regSymbol, regParenthesis, regSpace; /* compiled and freed again by every lex1token call */
+
+/* Reads a single token from input, starting at index *i.
+
+input is the NUL-terminated line being lexed and i points to the index at
+which reading starts. Blanks before the token are skipped; for a recognised
+token *i is then advanced past the token and past any blanks that follow it.
+
+Returns a newly allocated node (next is NULL) that the caller must free
+together with its lexeme. If no pattern matches, a pattern cannot be compiled
+or the lexeme cannot be allocated, the node has type undefined and a NULL
+lexeme. Returns NULL only when the node itself cannot be allocated. */
+token *lex1token(char *input, int *i)
+{
+/* regs and patterns are parallel arrays. POSIX character classes such as
+[:space:] are only recognised inside a bracket expression, hence the doubled
+brackets. For now only the symbols -, +, * and / are supported. */
+	regex_t *const regs[] = { &regSpace, &regNumber, &regSymbol, &regParenthesis };
+	static const char *const patterns[] = {
+		"^[[:space:]]*", "^[-+]?[[:digit:]]+", "^[-+/*]", "^[()]"
+	};
+	const int nRegs = sizeof regs / sizeof regs[0];
+	int compiled = 0;
+	tokenType matched = undefined;
+	regmatch_t a[1];
+
+	token *result = malloc(sizeof *result);
+	if (result == NULL)
+	{
+		return NULL;
+	}
+	result->type = undefined;
+	result->lexeme = NULL;
+	result->next = NULL;
+
+/* compiled counts the patterns that must be released again at cleanup */
+	while (compiled < nRegs)
+	{
+		if (regcomp(regs[compiled], patterns[compiled], REG_EXTENDED) != 0)
+		{
+			goto cleanup;
+		}
+		++compiled;
+	}
+
+/* moves the index forward so that leading white-space characters are ignored */
+	if (!regexec(&regSpace, input + *i, 1, a, 0))
+	{
+		*i += a[0].rm_eo;
+	}
+
+/* the number pattern is tried before the symbol pattern so that a sign glued
+to digits ("-5") belongs to the number; a lone - or + does not match the number
+pattern and is still lexed as a symbol */
+	if (!regexec(&regNumber, input + *i, 1, a, 0))
+	{
+		matched = number;
+	}
+	else if (!regexec(&regSymbol, input + *i, 1, a, 0))
+	{
+		matched = symbol;
+	}
+	else if (!regexec(&regParenthesis, input + *i, 1, a, 0))
+	{
+		matched = parenthesis;
+	}
+	else
+	{
+		goto cleanup;
+	}
+
+/* the patterns are anchored with ^, so rm_eo is the length of the lexeme */
+	result->lexeme = malloc((size_t)a[0].rm_eo + 1);
+	if (result->lexeme == NULL)
+	{
+		goto cleanup;
+	}
+	memcpy(result->lexeme, input + *i, (size_t)a[0].rm_eo);
+	result->lexeme[a[0].rm_eo] = '\0';
+	result->type = matched;
+	*i += a[0].rm_eo;
+
+/* ignoring trailing white-space characters */
+	if (!regexec(&regSpace, input + *i, 1, a, 0))
+	{
+		*i += a[0].rm_eo;
+	}
+
+cleanup:
+	while (compiled > 0)
+	{
+		regfree(regs[--compiled]);
+	}
+
+	return result;
+}
--- a/lexer.h
+++ b/lexer.h
@ -0,0 +1,20 @@
+#pragma once
+
+typedef enum /* lexical category of a token */
+{
+	undefined, /* text that matched none of the lexer's patterns */
+	number, /* a numeric literal */
+	symbol, /* an operator symbol (currently -, +, * or /) */
+	parenthesis /* an opening or closing parenthesis */
+} tokenType ;
+
+typedef struct _Token /* one node of the linked list built by the lexer */
+{
+	tokenType type; /* lexical category of this token */
+	char *lexeme; /* NUL-terminated copy of the token text, allocated by the lexer for recognised tokens */
+	struct _Token *next; /* following token in the list, NULL at the end */
+} token;
+
+/* lexLine takes one line read from standard input and returns its lexical
+elements as a linked list (NULL when a token is invalid or there are none) */
+token *lexLine(char *input);
--- a/util.h
+++ b/util.h
@ -1,5 +1,3 @@
 #pragma once

 char *readline(char *prompt);
-
-void add_history(char *unused);