From 8d89aa622aa51b915cb936a060d3c7cb58af4a80 Mon Sep 17 00:00:00 2001
From: kappa <kappa10@tutanota.com>
Date: Sun, 6 Jan 2019 12:30:07 +0100
Subject: [PATCH] =?UTF-8?q?=D0=A3=D0=B2=D0=B5=D0=B4=D0=B5=D0=BD=20=D0=BB?=
 =?UTF-8?q?=D0=B5=D0=BA=D1=81=D0=B5=D1=80,=20=D1=98=D0=BE=D1=88=20=D1=83?=
 =?UTF-8?q?=D0=B2=D0=B5=D0=BA=20=D0=BD=D0=B5=D1=81=D1=82=D0=B0=D0=B1=D0=B8?=
 =?UTF-8?q?=D0=BB=D0=B0=D0=BD=20=D0=B8=20=D0=BC=D0=B8=D0=BD=D0=B8=D0=BC?=
 =?UTF-8?q?=D0=B0=D0=BB=D0=B0=D0=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Makefile   |  4 +--
 cirilisp.c | 32 ++++++++++--------
 lexer.c    | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lexer.h    | 20 ++++++++++++
 util.h     |  2 --
 5 files changed, 136 insertions(+), 17 deletions(-)
 create mode 100644 lexer.c
 create mode 100644 lexer.h

diff --git a/Makefile b/Makefile
index 7e44317..58d3b59 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,7 @@ LDFLAGS = -lm -lc
 
 CC = cc
 
-SRC = cirilisp.c util.c
+SRC = cirilisp.c util.c lexer.c
 OBJ = $(SRC:.c=.o)
 
 all: cirilisp
@@ -21,7 +21,7 @@ all: cirilisp
 .c.o:
 	$(CC) -c $(CFLAGS) $<
 
-$(OBJ): util.h
+$(OBJ): util.h lexer.h
 
 cirilisp: $(OBJ)
 	$(CC) -o $@ $(OBJ) $(LDFLAGS)
diff --git a/cirilisp.c b/cirilisp.c
index 8e7c1b0..5a1526f 100644
--- a/cirilisp.c
+++ b/cirilisp.c
@@ -3,32 +3,38 @@
 #include <stdlib.h>
 
 #include "util.h"
+#include "lexer.h"
 
 int main(int argc, char **argv)
 {
-	// Омогућава библиотекама коришћеним у интерпретеру да протумаче
-	// српску ћирилицу
-	// struct lconv *loc;
-	setlocale(LC_ALL, "sr_RS.utf8");
-	// loc = localeconv();
-
-	// Бојлерплејт обавештења о окружењу
-	puts("Прост РЕПЛ:");
-	puts("Притисните ctrl+c да бисте изашли\n");
+/* Lets the libraries used by the interpreter interpret Serbian Cyrillic */
+	if (setlocale(LC_ALL, "sr_RS.utf8") == NULL)
+	{
+		fprintf(stderr, "locale couldn't be set to \"sr_RS.utf8\", check if you've enabled it on your system\n");
+		exit(EXIT_FAILURE);
+	}
 
 	while (1)
 	{
-		char *input = readline("k> ");
+		char *input = readline("Л> ");
 		if (input == NULL)
 		{
 			putchar('\n');
+			printf("Крај улазног тока.\n");
+			/* the message above reads "End of input stream." */
 			exit(0);
 		}
 
-		printf("НАПИСАЛИ СТЕ: %s\n", input);
+		token *current = lexLine(input), *next;
+		for (int i = 0; current != NULL; current = next, i++)
+		{
+			next = current->next; /* read before this node is freed */
+			printf("Токен бр. %d: \"%s\", тип:%s\n", i, current->lexeme, current->type == number
+				? "number" : (current->type == symbol ? "symbol" : "parenthesis"));
+			free(current->lexeme);
+			free(current);
+		}
 
 		free(input);
 	}
-
-	return 0;
 }
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..93062b9
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <regex.h>
+
+#include "lexer.h"
+
+token *lex1token(char *input, int *i);
+/* returns a pointer to a single token structure that describes one token; reads
+the input string starting at the i-th character and updates i so that it then
+indexes the next token or the end of the string */
+
+/* Lexes one line into a linked list of tokens; the caller frees every node and
+its lexeme. Returns NULL if the line has no tokens or contains an invalid one. */
+token *lexLine(char *input)
+{
+	int i = 0, n = strlen(input);
+	token *root = NULL, **tail = &root;
+	while (i < n)
+	{
+		token *latest = lex1token(input, &i);
+		if (latest == NULL || latest->type == undefined)
+		{
+/* report where lexing stopped, free the list built so far and give up */
+			fprintf(stderr, "Невалидан токен на месту %d\n", i);
+			free(latest); /* no lexeme is allocated for an undefined token */
+			while (root != NULL)
+			{
+				token *next = root->next;
+				free(root->lexeme);
+				free(root);
+				root = next;
+			}
+			return NULL;
+		}
+		*tail = latest;
+		tail = &latest->next;
+	}
+	return root;
+}
+
+regex_t regNumber, regSymbol, regParenthesis, regSpace;
+
+/* Reads the token that starts at input[*i], after optional whitespace, and moves
+*i past it and the whitespace that follows. Returns a heap-allocated token (type
+undefined and lexeme NULL if nothing matches), or NULL if memory runs out. */
+token *lex1token(char *input, int *i)
+{
+	token *result = malloc(sizeof *result);
+	if (result == NULL)
+	{
+		return NULL;
+	}
+	result->lexeme = NULL;
+	result->next = NULL;
+
+/* a POSIX class such as [:space:] is only recognised inside a bracket expression */
+	regcomp(&regSpace, "^[[:space:]]*", REG_EXTENDED);
+	regcomp(&regNumber, "^[-+]?[[:digit:]]+", REG_EXTENDED);
+	regcomp(&regSymbol, "^[-+/*]", REG_EXTENDED);
+/* for now only the symbols -, +, * and / are supported */
+	regcomp(&regParenthesis, "^[()]", REG_EXTENDED);
+
+	regmatch_t a[1];
+	regexec(&regSpace, input + *i, 1, a, 0);
+	*i += a[0].rm_eo;
+/* moves the index so that leading whitespace characters are ignored */
+/* numbers are tried before symbols so that "-5" is one number, not "-" and "5" */
+	result->type = !regexec(&regNumber, input + *i, 1, a, 0) ? number
+		: !regexec(&regSymbol, input + *i, 1, a, 0) ? symbol
+		: !regexec(&regParenthesis, input + *i, 1, a, 0) ? parenthesis : undefined;
+	if (result->type == undefined)
+	{
+		goto freeRegexes;
+	}
+	result->lexeme = malloc(a[0].rm_eo + 1);
+	if (result->lexeme == NULL)
+	{
+		free(result);
+		result = NULL;
+		goto freeRegexes;
+	}
+	memcpy(result->lexeme, input + *i, a[0].rm_eo);
+	result->lexeme[a[0].rm_eo] = '\0';
+	*i += a[0].rm_eo;
+	regexec(&regSpace, input + *i, 1, a, 0);
+	*i += a[0].rm_eo;
+/* skips the whitespace that follows the token */
+freeRegexes:
+	regfree(&regSpace);
+	regfree(&regNumber);
+	regfree(&regSymbol);
+	regfree(&regParenthesis);
+	return result;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..944f869
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,20 @@
+#pragma once
+
+typedef enum /* kinds of token the lexer assigns */
+{
+	undefined, /* the lexer matched none of its patterns */
+	number, /* matched the lexer's number pattern */
+	symbol, /* matched the lexer's symbol pattern (currently -, +, * and /) */
+	parenthesis /* matched ( or ) */
+} tokenType ;
+
+typedef struct _Token /* one node of the token list returned by lexLine */
+{
+	tokenType type; /* kind of token */
+	char *lexeme; /* heap-allocated, NUL-terminated copy of the token text; no text is stored for undefined tokens */
+	struct _Token *next; /* following token, or NULL at the end of the list */
+} token;
+
+/* lexLine takes a line read from standard input as its argument and returns
+its lexical elements as a linked list */
+token *lexLine(char *input);
diff --git a/util.h b/util.h
index d0d1374..0a9f157 100644
--- a/util.h
+++ b/util.h
@@ -1,5 +1,3 @@
 #pragma once
 
 char *readline(char *prompt);
-
-void add_history(char *unused);