Initial commit
This commit is contained in:
593
source/lexer.c
Executable file
593
source/lexer.c
Executable file
@@ -0,0 +1,593 @@
|
||||
|
||||
#include <lexer.h>
|
||||
|
||||
#include <replacement.h>
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
|
||||
#define ARRAY_SIZE( arr ) ( sizeof( ( arr ) ) / sizeof( ( arr )[0] ) )
|
||||
|
||||
|
||||
|
||||
lexer * lexer_new()
|
||||
{
|
||||
struct lexer * v;
|
||||
|
||||
v = ( struct lexer * ) malloc( sizeof( struct lexer ) );
|
||||
|
||||
|
||||
v->capacity = ARRAY_INIT_CAPACITY;
|
||||
|
||||
v->total = 0;
|
||||
|
||||
v->keys = malloc(sizeof( int ) * v->capacity );
|
||||
|
||||
v->tokens = malloc(sizeof( char * ) * v->capacity );
|
||||
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
/* Number of (key, token) pairs currently stored in the lexer. */
int lexer_length( lexer * v )
{
    return v->total;
}
|
||||
|
||||
/*
 * Resize the parallel key/token arrays to `capacity` slots.
 *
 * Bug fix: the original passed currentLexer->keys to BOTH realloc
 * calls, so the token array was never resized and the key buffer was
 * handed to realloc twice (the second call could receive a pointer the
 * first call had already freed -- undefined behaviour).
 */
void lexer_resize( lexer * currentLexer, int capacity )
{
    int * keys = realloc( currentLexer->keys, sizeof( int ) * capacity );
    char * * tokens = realloc( currentLexer->tokens, sizeof( char * ) * capacity );

    /* Only commit pointers that were actually reallocated; on failure
       the old buffer is still valid (realloc leaves it untouched). */
    if ( keys != NULL ) {
        currentLexer->keys = keys;
    }

    if ( tokens != NULL ) {
        currentLexer->tokens = tokens;
    }

    /* Advertise the new capacity only when both arrays really have it. */
    if ( keys != NULL && tokens != NULL ) {
        currentLexer->capacity = capacity;
    }
}
|
||||
|
||||
/*
 * Append a (key, token) pair, doubling capacity when the arrays are
 * full.  The token pointer is stored as-is, not copied, so it must
 * outlive the lexer entry.
 */
void lexer_add( lexer * currentLexer, int key, char * token )
{
    int slot;

    if ( currentLexer->total == currentLexer->capacity ) {
        lexer_resize( currentLexer, currentLexer->capacity * 2 );
    }

    slot = currentLexer->total;
    currentLexer->keys[ slot ] = key;
    currentLexer->tokens[ slot ] = token;
    currentLexer->total = slot + 1;
}
|
||||
|
||||
/* Token string stored at `index` (no bounds checking, like the rest
   of the accessors). */
char * lexer_getToken( lexer * currentLexer, int index ) {
    return currentLexer->tokens[ index ];
}
|
||||
|
||||
/* Key (source offset) stored at `index` (no bounds checking). */
int lexer_getKey( lexer * currentLexer, int index ) {
    return currentLexer->keys[ index ];
}
|
||||
|
||||
/* Record the lexer's current position marker. */
void lexer_setIndex( lexer * currentLexer, int index ) {
    currentLexer->index = index;
}
|
||||
|
||||
/*
 * Record every occurrence of the literal substring `needle` inside
 * `haystack`, storing each match's byte offset as the key and the
 * needle pointer itself as the token.
 *
 * Returns the number of matches recorded.  The scan advances one byte
 * past each match start, so overlapping occurrences are all found.
 */
int lexer_tokenize( lexer * currentLexer, char * haystack, char * needle )
{
    int count = 0;
    char * cursor = haystack;

    /* Extra parentheses mark the assignment-in-condition as
       intentional (the original form trips -Wparentheses). */
    while ( ( cursor = strstr( cursor, needle ) ) != NULL ) {
        int key = ( int )( cursor - haystack );

        lexer_add( currentLexer, key, needle );

        cursor++;
        ++count;
    }

    return count;
}
|
||||
|
||||
/*
 * Scan backwards from entry `fromKey - 1` and return the key (source
 * offset) of the first entry whose token equals `token`, or -1 when no
 * earlier entry matches.
 */
int lexer_findPreviousKeyByToken( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i ) {
        char * candidate = lexer_getToken( currentLexer, i );

        if ( strcmp( candidate, token ) == 0 ) {
            return lexer_getKey( currentLexer, i );
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Scan backwards from entry `fromKey - 1` and return the INDEX of the
 * first entry whose token equals `token`, or -1 when no earlier entry
 * matches.  (Companion to lexer_findPreviousKeyByToken, which returns
 * the key instead.)
 *
 * Cleanup: dropped the unused local `key` the original fetched on
 * every iteration.
 */
int lexer_findPreviousTokenIndex( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i )
    {
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return i;
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Sort the stored pairs in ascending key order with a plain bubble
 * sort, swapping the parallel key and token arrays in lock-step so
 * each token stays attached to its offset.
 */
void lexer_sortKeys( lexer * currentLexer ) {

    int count = lexer_length( currentLexer );
    int pass, idx;

    for ( pass = 0; pass < count - 1; ++pass )
    {
        for ( idx = 0; idx < count - 1 - pass; ++idx )
        {
            if ( currentLexer->keys[ idx ] > currentLexer->keys[ idx + 1 ] )
            {
                int swapKey = currentLexer->keys[ idx + 1 ];
                char * swapToken = currentLexer->tokens[ idx + 1 ];

                currentLexer->keys[ idx + 1 ] = currentLexer->keys[ idx ];
                currentLexer->keys[ idx ] = swapKey;

                currentLexer->tokens[ idx + 1 ] = currentLexer->tokens[ idx ];
                currentLexer->tokens[ idx ] = swapToken;
            }
        }
    }
}
|
||||
|
||||
|
||||
/*
 * Scan forward from entry `fromKey` and return the key (source offset)
 * of the first entry whose token equals `token`, or -1 when nothing
 * matches.
 */
int lexer_findNextToken( lexer * currentLexer, int fromKey, char * token ) {

    int count = lexer_length( currentLexer );

    for ( int i = fromKey; i < count; ++i ) {
        char * candidate = lexer_getToken( currentLexer, i );

        if ( strcmp( candidate, token ) == 0 ) {
            return lexer_getKey( currentLexer, i );
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Scan forward from entry `fromKey` and return the INDEX of the first
 * entry whose token equals `token`, or -1 when nothing matches.
 * (Companion to lexer_findNextToken, which returns the key instead.)
 *
 * Cleanup: dropped the unused local `key` the original fetched on
 * every iteration.
 */
int lexer_findNextTokenIndex( lexer * currentLexer, int fromKey, char * token ) {

    int count = lexer_length( currentLexer );

    for ( int i = fromKey; i < count; ++i )
    {
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return i;
        }
    }

    return -1;
}
|
||||
|
||||
|
||||
// change name to something it does.
|
||||
int lexer_findPreviouseTokenIndex( lexer * currentLexer, int fromIndex, struct array * validPreviousTokens ) {
|
||||
|
||||
//int count = array_length( currentLexer );
|
||||
|
||||
|
||||
for (int i = fromIndex - 1; i >= 0; --i)
|
||||
{
|
||||
|
||||
char * currentToken = lexer_getToken( currentLexer, i );
|
||||
|
||||
int key = lexer_getKey( currentLexer, i );
|
||||
|
||||
int validTokenCount = array_length( validPreviousTokens );
|
||||
|
||||
for (int j = 0; j < validTokenCount; ++j)
|
||||
{
|
||||
|
||||
char * currentValidToken = array_get( validPreviousTokens, j );
|
||||
|
||||
//printf("find previous token: %s %s \n", currentToken, currentValidToken);
|
||||
|
||||
if ( strcmp( currentToken, currentValidToken ) == 0 )
|
||||
{
|
||||
|
||||
//printf("token found!!\n\n");
|
||||
|
||||
return key;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
return -1;
|
||||
|
||||
}
|
||||
|
||||
/*
 * Tokenize `haystack` against the PCRE pattern `needle`, recording
 * `token` at (match offset + 1) for every match found.  The "+1"
 * offsets compensate for the leading \s the patterns match (e.g.
 * "\sclass\s") -- NOTE(review): confirm all callers pass such patterns.
 *
 * Returns 1 on every path, error or success -- callers cannot tell the
 * difference; consider distinct return codes in a follow-up.
 *
 * Fixes over the original:
 *  - option_bits and utf8 were read without EVER being initialised
 *    (undefined behaviour).  The pattern is compiled with options == 0
 *    below, so neither PCRE_UTF8 nor any PCRE_NEWLINE_* override can
 *    be in effect; both are now explicitly zero, which also makes
 *    crlf_is_newline deterministically 0 (no CRLF double-advance).
 *  - removed the unused name_table variable.
 *
 * NOTE(review): `subject` comes from text_copy() and is never freed --
 * presumably a leak; confirm text_copy allocates before adding free().
 */
int lexer_tokenizeRegex( lexer * currentLexer, char * haystack, char * needle, char * token ) {

    const char * error;
    int erroffset;
    int crlf_is_newline;
    int ovector[OVECCOUNT];
    int subject_length;
    int rc;

    /* Pattern is compiled with default options (0): no UTF-8 mode, no
       newline-convention override.  Previously read uninitialised. */
    unsigned int option_bits = 0;
    int utf8 = 0;

    char * subject = text_copy( haystack );

    subject_length = ( int ) strlen( subject );

    pcre * re = pcre_compile( needle,     /* the needle */
                              0,          /* default options */
                              &error,     /* for error message */
                              &erroffset, /* for error offset */
                              NULL );     /* use default character tables */

    if ( re == NULL )
    {
        printf( "PCRE compilation failed at offset %d: %s\n", erroffset, error );
        return 1;
    }

    rc = pcre_exec( re,             /* the compiled needle */
                    NULL,           /* no extra data - we didn't study the needle */
                    subject,        /* the subject string */
                    subject_length, /* the length of the subject */
                    0,              /* start at offset 0 in the subject */
                    0,              /* default options */
                    ovector,        /* output vector for substring information */
                    OVECCOUNT );    /* number of elements in the output vector */

    if ( rc < 0 ) {

        switch ( rc )
        {
            case PCRE_ERROR_NOMATCH:
                /* No match at all: nothing recorded, not an error worth
                   reporting. */
                break;

            default:
                printf( "Matching error %d\n", rc );
                break;
        }

        pcre_free( re ); /* Release memory used for the compiled needle */

        return 1;
    }

    /* First match. */
    lexer_add( currentLexer, ovector[0] + 1, token );

    /* See if CRLF is a valid newline sequence.  With option_bits == 0
       (default-compiled pattern) this is always 0. */
    crlf_is_newline = option_bits == PCRE_NEWLINE_ANY ||
                      option_bits == PCRE_NEWLINE_CRLF ||
                      option_bits == PCRE_NEWLINE_ANYCRLF;

    /* Loop for second and subsequent matches */
    for (;;)
    {
        int options = 0;               /* Normally no options */
        int start_offset = ovector[1]; /* Start at end of previous match */

        /* If the previous match was for an empty string, we are finished if
           we are at the end of the subject.  Otherwise, arrange to run
           another match at the same point to see if a non-empty match can
           be found. */
        if ( ovector[0] == ovector[1] )
        {
            if ( ovector[0] == subject_length ) break;
            options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
        }

        /* Run the next matching operation */
        rc = pcre_exec( re,             /* the compiled needle */
                        NULL,           /* no extra data - we didn't study the needle */
                        subject,        /* the subject string */
                        subject_length, /* the length of the subject */
                        start_offset,   /* starting offset in the subject */
                        options,        /* options */
                        ovector,        /* output vector for substring information */
                        OVECCOUNT );    /* number of elements in the output vector */

        if ( rc == PCRE_ERROR_NOMATCH )
        {
            if ( options == 0 )
                break; /* All matches found */

            ovector[1] = start_offset + 1; /* Advance one byte */

            if ( crlf_is_newline &&                    /* If CRLF is newline & */
                 start_offset < subject_length - 1 &&  /* we are at CRLF, */
                 subject[start_offset] == '\r' &&
                 subject[start_offset + 1] == '\n' ) {

                ovector[1] += 1; /* Advance by one more. */

            } else if ( utf8 ) {                       /* Otherwise, ensure we */
                                                       /* advance a whole UTF-8 */
                while ( ovector[1] < subject_length )  /* character. */
                {
                    if ( ( subject[ovector[1]] & 0xc0 ) != 0x80 )
                        break;
                    ovector[1] += 1;
                }
            }

            continue; /* Go round the loop again */
        }

        /* Other matching errors are not recoverable. */
        if ( rc < 0 )
        {
            printf( "Matching error %d\n", rc );

            pcre_free( re ); /* Release memory used for the compiled needle */

            return 1;
        }

        /* Match succeeded: record it the same way as the first. */
        lexer_add( currentLexer, ovector[0] + 1, token );
    }

    printf( "\n" );

    pcre_free( re ); /* Release memory used for the compiled needle */

    return 1;
}
|
||||
|
||||
|
||||
/*
 * Populate the lexer with every token occurrence found in `source`.
 * "class" is matched via regex (whitespace-delimited); the remaining
 * tokens are plain substring scans, issued in the same order as the
 * original call sequence.
 */
void lexer_getTokens( lexer * currentLexer, char * source ) {

    static char * plainTokens[] = {
        "{", "}", "(", ")", ";", "<", ">", "=", "->", ".", "\"",
        "#include", "#", "extends", "reflect", "template"
    };

    lexer_tokenizeRegex( currentLexer, source, "\\sclass\\s", "class" );

    for ( size_t i = 0; i < ARRAY_SIZE( plainTokens ); ++i ) {
        lexer_tokenize( currentLexer, source, plainTokens[ i ] );
    }
}
|
||||
|
||||
/*
 * Starting at entry `fromKey` (expected to be an opening "{"), walk
 * forward tracking brace depth and return the index of the matching
 * "}" that brings depth back to zero, or -1 if the body never closes.
 *
 * Note: if the entry at `fromKey` is not "{", depth is 0 after the
 * first iteration and `fromKey` itself is returned -- callers must
 * pass the index of an opening brace.
 *
 * Cleanup: dropped the unused local `key` the original fetched on
 * every iteration.
 */
int lexer_findBodyCloseIndex( lexer * currentLexer, int fromKey ) {

    int count = lexer_length( currentLexer );

    int depth = 0;

    for ( int i = fromKey; i < count; ++i )
    {
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, "{" ) == 0 ) {
            depth++;
        }

        if ( strcmp( currentToken, "}" ) == 0 ) {
            depth--;
        }

        if ( depth == 0 ) {
            return i;
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Scan backwards from entry `fromKey - 1` and return the key (source
 * offset) of the first entry whose token equals `token`, or -1 when no
 * earlier entry matches.
 *
 * Bug fix: the original loop condition was `i > 0`, which skipped
 * entry 0 -- inconsistent with lexer_findPreviousKeyByToken /
 * lexer_findPreviousTokenIndex, which both use `i >= 0`.
 */
int lexer_findPreviousToken( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i )
    {
        int key = lexer_getKey( currentLexer, i );

        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return key;
        }
    }

    return -1;
}
|
||||
|
||||
void lexer_sortKeys_separate( struct array * keys, struct array * tokens ) {
|
||||
|
||||
int count = array_length( keys );
|
||||
|
||||
int i, j;
|
||||
|
||||
void * temp;
|
||||
|
||||
void * temp2;
|
||||
|
||||
for (i = 0; i < (count - 1); ++i)
|
||||
{
|
||||
for (j = 0; j < count - 1 - i; ++j )
|
||||
{
|
||||
intptr_t a = ( intptr_t ) array_get( keys, j );
|
||||
|
||||
intptr_t b = ( intptr_t ) array_get( keys, j+1 );
|
||||
|
||||
if (a > b)
|
||||
{
|
||||
temp = keys->items[j+1];
|
||||
temp2 = tokens->items[j+1];
|
||||
|
||||
keys->items[j+1] = keys->items[j];
|
||||
keys->items[j] = temp;
|
||||
|
||||
tokens->items[j+1] = tokens->items[j];
|
||||
tokens->items[j] = temp2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user