Initial commit
This commit is contained in:
593
source/lexer.c
Executable file
593
source/lexer.c
Executable file
@@ -0,0 +1,593 @@
|
||||
|
||||
#include <lexer.h>
|
||||
|
||||
#include <replacement.h>
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
|
||||
#define ARRAY_SIZE( arr ) ( sizeof( ( arr ) ) / sizeof( ( arr )[0] ) )
|
||||
|
||||
|
||||
|
||||
lexer * lexer_new()
|
||||
{
|
||||
struct lexer * v;
|
||||
|
||||
v = ( struct lexer * ) malloc( sizeof( struct lexer ) );
|
||||
|
||||
|
||||
v->capacity = ARRAY_INIT_CAPACITY;
|
||||
|
||||
v->total = 0;
|
||||
|
||||
v->keys = malloc(sizeof( int ) * v->capacity );
|
||||
|
||||
v->tokens = malloc(sizeof( char * ) * v->capacity );
|
||||
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
/* Number of (key, token) pairs currently stored in the lexer. */
int lexer_length( lexer * v )
{
    return v->total;
}
|
||||
|
||||
/*
 * Resize the parallel key/token arrays to `capacity` slots.
 *
 * Bug fix: the original passed currentLexer->keys to BOTH realloc
 * calls, so the token array was never resized and the key buffer was
 * handed to realloc twice (the second call could receive a pointer the
 * first call had already freed -- undefined behaviour).
 */
void lexer_resize( lexer * currentLexer, int capacity )
{
    int * keys = realloc( currentLexer->keys, sizeof( int ) * capacity );
    char * * tokens = realloc( currentLexer->tokens, sizeof( char * ) * capacity );

    /* Only commit pointers that were actually reallocated; on failure
       the old buffer is still valid (realloc leaves it untouched). */
    if ( keys != NULL ) {
        currentLexer->keys = keys;
    }

    if ( tokens != NULL ) {
        currentLexer->tokens = tokens;
    }

    /* Advertise the new capacity only when both arrays really have it. */
    if ( keys != NULL && tokens != NULL ) {
        currentLexer->capacity = capacity;
    }
}
|
||||
|
||||
/*
 * Append a (key, token) pair, doubling capacity when the arrays are
 * full.  The token pointer is stored as-is, not copied, so it must
 * outlive the lexer entry.
 */
void lexer_add( lexer * currentLexer, int key, char * token )
{
    int slot;

    if ( currentLexer->total == currentLexer->capacity ) {
        lexer_resize( currentLexer, currentLexer->capacity * 2 );
    }

    slot = currentLexer->total;
    currentLexer->keys[ slot ] = key;
    currentLexer->tokens[ slot ] = token;
    currentLexer->total = slot + 1;
}
|
||||
|
||||
/* Token string stored at `index` (no bounds checking, like the rest
   of the accessors). */
char * lexer_getToken( lexer * currentLexer, int index ) {
    return currentLexer->tokens[ index ];
}
|
||||
|
||||
/* Key (source offset) stored at `index` (no bounds checking). */
int lexer_getKey( lexer * currentLexer, int index ) {
    return currentLexer->keys[ index ];
}
|
||||
|
||||
/* Record the lexer's current position marker. */
void lexer_setIndex( lexer * currentLexer, int index ) {
    currentLexer->index = index;
}
|
||||
|
||||
/*
 * Record every occurrence of the literal substring `needle` inside
 * `haystack`, storing each match's byte offset as the key and the
 * needle pointer itself as the token.
 *
 * Returns the number of matches recorded.  The scan advances one byte
 * past each match start, so overlapping occurrences are all found.
 */
int lexer_tokenize( lexer * currentLexer, char * haystack, char * needle )
{
    int count = 0;
    char * cursor = haystack;

    /* Extra parentheses mark the assignment-in-condition as
       intentional (the original form trips -Wparentheses). */
    while ( ( cursor = strstr( cursor, needle ) ) != NULL ) {
        int key = ( int )( cursor - haystack );

        lexer_add( currentLexer, key, needle );

        cursor++;
        ++count;
    }

    return count;
}
|
||||
|
||||
/*
 * Scan backwards from entry `fromKey - 1` and return the key (source
 * offset) of the first entry whose token equals `token`, or -1 when no
 * earlier entry matches.
 */
int lexer_findPreviousKeyByToken( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i ) {
        char * candidate = lexer_getToken( currentLexer, i );

        if ( strcmp( candidate, token ) == 0 ) {
            return lexer_getKey( currentLexer, i );
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Scan backwards from entry `fromKey - 1` and return the INDEX of the
 * first entry whose token equals `token`, or -1 when no earlier entry
 * matches.  (Companion to lexer_findPreviousKeyByToken, which returns
 * the key instead.)
 *
 * Cleanup: dropped the unused local `key` the original fetched on
 * every iteration.
 */
int lexer_findPreviousTokenIndex( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i )
    {
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return i;
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Sort the stored pairs in ascending key order with a plain bubble
 * sort, swapping the parallel key and token arrays in lock-step so
 * each token stays attached to its offset.
 */
void lexer_sortKeys( lexer * currentLexer ) {

    int count = lexer_length( currentLexer );
    int pass, idx;

    for ( pass = 0; pass < count - 1; ++pass )
    {
        for ( idx = 0; idx < count - 1 - pass; ++idx )
        {
            if ( currentLexer->keys[ idx ] > currentLexer->keys[ idx + 1 ] )
            {
                int swapKey = currentLexer->keys[ idx + 1 ];
                char * swapToken = currentLexer->tokens[ idx + 1 ];

                currentLexer->keys[ idx + 1 ] = currentLexer->keys[ idx ];
                currentLexer->keys[ idx ] = swapKey;

                currentLexer->tokens[ idx + 1 ] = currentLexer->tokens[ idx ];
                currentLexer->tokens[ idx ] = swapToken;
            }
        }
    }
}
|
||||
|
||||
|
||||
/*
 * Scan forward from entry `fromKey` and return the key (source offset)
 * of the first entry whose token equals `token`, or -1 when nothing
 * matches.
 */
int lexer_findNextToken( lexer * currentLexer, int fromKey, char * token ) {

    int count = lexer_length( currentLexer );

    for ( int i = fromKey; i < count; ++i ) {
        char * candidate = lexer_getToken( currentLexer, i );

        if ( strcmp( candidate, token ) == 0 ) {
            return lexer_getKey( currentLexer, i );
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Scan forward from entry `fromKey` and return the INDEX of the first
 * entry whose token equals `token`, or -1 when nothing matches.
 * (Companion to lexer_findNextToken, which returns the key instead.)
 *
 * Cleanup: dropped the unused local `key` the original fetched on
 * every iteration.
 */
int lexer_findNextTokenIndex( lexer * currentLexer, int fromKey, char * token ) {

    int count = lexer_length( currentLexer );

    for ( int i = fromKey; i < count; ++i )
    {
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return i;
        }
    }

    return -1;
}
|
||||
|
||||
|
||||
// change name to something it does.
|
||||
int lexer_findPreviouseTokenIndex( lexer * currentLexer, int fromIndex, struct array * validPreviousTokens ) {
|
||||
|
||||
//int count = array_length( currentLexer );
|
||||
|
||||
|
||||
for (int i = fromIndex - 1; i >= 0; --i)
|
||||
{
|
||||
|
||||
char * currentToken = lexer_getToken( currentLexer, i );
|
||||
|
||||
int key = lexer_getKey( currentLexer, i );
|
||||
|
||||
int validTokenCount = array_length( validPreviousTokens );
|
||||
|
||||
for (int j = 0; j < validTokenCount; ++j)
|
||||
{
|
||||
|
||||
char * currentValidToken = array_get( validPreviousTokens, j );
|
||||
|
||||
//printf("find previous token: %s %s \n", currentToken, currentValidToken);
|
||||
|
||||
if ( strcmp( currentToken, currentValidToken ) == 0 )
|
||||
{
|
||||
|
||||
//printf("token found!!\n\n");
|
||||
|
||||
return key;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
return -1;
|
||||
|
||||
}
|
||||
|
||||
/*
 * Tokenize `haystack` against the PCRE pattern `needle`, recording
 * `token` at (match offset + 1) for every match found.  The "+1"
 * offsets compensate for the leading \s the patterns match (e.g.
 * "\sclass\s") -- NOTE(review): confirm all callers pass such patterns.
 *
 * Returns 1 on every path, error or success -- callers cannot tell the
 * difference; consider distinct return codes in a follow-up.
 *
 * Fixes over the original:
 *  - option_bits and utf8 were read without EVER being initialised
 *    (undefined behaviour).  The pattern is compiled with options == 0
 *    below, so neither PCRE_UTF8 nor any PCRE_NEWLINE_* override can
 *    be in effect; both are now explicitly zero, which also makes
 *    crlf_is_newline deterministically 0 (no CRLF double-advance).
 *  - removed the unused name_table variable.
 *
 * NOTE(review): `subject` comes from text_copy() and is never freed --
 * presumably a leak; confirm text_copy allocates before adding free().
 */
int lexer_tokenizeRegex( lexer * currentLexer, char * haystack, char * needle, char * token ) {

    const char * error;
    int erroffset;
    int crlf_is_newline;
    int ovector[OVECCOUNT];
    int subject_length;
    int rc;

    /* Pattern is compiled with default options (0): no UTF-8 mode, no
       newline-convention override.  Previously read uninitialised. */
    unsigned int option_bits = 0;
    int utf8 = 0;

    char * subject = text_copy( haystack );

    subject_length = ( int ) strlen( subject );

    pcre * re = pcre_compile( needle,     /* the needle */
                              0,          /* default options */
                              &error,     /* for error message */
                              &erroffset, /* for error offset */
                              NULL );     /* use default character tables */

    if ( re == NULL )
    {
        printf( "PCRE compilation failed at offset %d: %s\n", erroffset, error );
        return 1;
    }

    rc = pcre_exec( re,             /* the compiled needle */
                    NULL,           /* no extra data - we didn't study the needle */
                    subject,        /* the subject string */
                    subject_length, /* the length of the subject */
                    0,              /* start at offset 0 in the subject */
                    0,              /* default options */
                    ovector,        /* output vector for substring information */
                    OVECCOUNT );    /* number of elements in the output vector */

    if ( rc < 0 ) {

        switch ( rc )
        {
            case PCRE_ERROR_NOMATCH:
                /* No match at all: nothing recorded, not an error worth
                   reporting. */
                break;

            default:
                printf( "Matching error %d\n", rc );
                break;
        }

        pcre_free( re ); /* Release memory used for the compiled needle */

        return 1;
    }

    /* First match. */
    lexer_add( currentLexer, ovector[0] + 1, token );

    /* See if CRLF is a valid newline sequence.  With option_bits == 0
       (default-compiled pattern) this is always 0. */
    crlf_is_newline = option_bits == PCRE_NEWLINE_ANY ||
                      option_bits == PCRE_NEWLINE_CRLF ||
                      option_bits == PCRE_NEWLINE_ANYCRLF;

    /* Loop for second and subsequent matches */
    for (;;)
    {
        int options = 0;               /* Normally no options */
        int start_offset = ovector[1]; /* Start at end of previous match */

        /* If the previous match was for an empty string, we are finished if
           we are at the end of the subject.  Otherwise, arrange to run
           another match at the same point to see if a non-empty match can
           be found. */
        if ( ovector[0] == ovector[1] )
        {
            if ( ovector[0] == subject_length ) break;
            options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
        }

        /* Run the next matching operation */
        rc = pcre_exec( re,             /* the compiled needle */
                        NULL,           /* no extra data - we didn't study the needle */
                        subject,        /* the subject string */
                        subject_length, /* the length of the subject */
                        start_offset,   /* starting offset in the subject */
                        options,        /* options */
                        ovector,        /* output vector for substring information */
                        OVECCOUNT );    /* number of elements in the output vector */

        if ( rc == PCRE_ERROR_NOMATCH )
        {
            if ( options == 0 )
                break; /* All matches found */

            ovector[1] = start_offset + 1; /* Advance one byte */

            if ( crlf_is_newline &&                    /* If CRLF is newline & */
                 start_offset < subject_length - 1 &&  /* we are at CRLF, */
                 subject[start_offset] == '\r' &&
                 subject[start_offset + 1] == '\n' ) {

                ovector[1] += 1; /* Advance by one more. */

            } else if ( utf8 ) {                       /* Otherwise, ensure we */
                                                       /* advance a whole UTF-8 */
                while ( ovector[1] < subject_length )  /* character. */
                {
                    if ( ( subject[ovector[1]] & 0xc0 ) != 0x80 )
                        break;
                    ovector[1] += 1;
                }
            }

            continue; /* Go round the loop again */
        }

        /* Other matching errors are not recoverable. */
        if ( rc < 0 )
        {
            printf( "Matching error %d\n", rc );

            pcre_free( re ); /* Release memory used for the compiled needle */

            return 1;
        }

        /* Match succeeded: record it the same way as the first. */
        lexer_add( currentLexer, ovector[0] + 1, token );
    }

    printf( "\n" );

    pcre_free( re ); /* Release memory used for the compiled needle */

    return 1;
}
|
||||
|
||||
|
||||
/*
 * Populate the lexer with every token occurrence found in `source`.
 * "class" is matched via regex (whitespace-delimited); the remaining
 * tokens are plain substring scans, issued in the same order as the
 * original call sequence.
 */
void lexer_getTokens( lexer * currentLexer, char * source ) {

    static char * plainTokens[] = {
        "{", "}", "(", ")", ";", "<", ">", "=", "->", ".", "\"",
        "#include", "#", "extends", "reflect", "template"
    };

    lexer_tokenizeRegex( currentLexer, source, "\\sclass\\s", "class" );

    for ( size_t i = 0; i < ARRAY_SIZE( plainTokens ); ++i ) {
        lexer_tokenize( currentLexer, source, plainTokens[ i ] );
    }
}
|
||||
|
||||
/*
 * Starting at entry `fromKey` (expected to be an opening "{"), walk
 * forward tracking brace depth and return the index of the matching
 * "}" that brings depth back to zero, or -1 if the body never closes.
 *
 * Note: if the entry at `fromKey` is not "{", depth is 0 after the
 * first iteration and `fromKey` itself is returned -- callers must
 * pass the index of an opening brace.
 *
 * Cleanup: dropped the unused local `key` the original fetched on
 * every iteration.
 */
int lexer_findBodyCloseIndex( lexer * currentLexer, int fromKey ) {

    int count = lexer_length( currentLexer );

    int depth = 0;

    for ( int i = fromKey; i < count; ++i )
    {
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, "{" ) == 0 ) {
            depth++;
        }

        if ( strcmp( currentToken, "}" ) == 0 ) {
            depth--;
        }

        if ( depth == 0 ) {
            return i;
        }
    }

    return -1;
}
|
||||
|
||||
/*
 * Scan backwards from entry `fromKey - 1` and return the key (source
 * offset) of the first entry whose token equals `token`, or -1 when no
 * earlier entry matches.
 *
 * Bug fix: the original loop condition was `i > 0`, which skipped
 * entry 0 -- inconsistent with lexer_findPreviousKeyByToken /
 * lexer_findPreviousTokenIndex, which both use `i >= 0`.
 */
int lexer_findPreviousToken( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i )
    {
        int key = lexer_getKey( currentLexer, i );

        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return key;
        }
    }

    return -1;
}
|
||||
|
||||
void lexer_sortKeys_separate( struct array * keys, struct array * tokens ) {
|
||||
|
||||
int count = array_length( keys );
|
||||
|
||||
int i, j;
|
||||
|
||||
void * temp;
|
||||
|
||||
void * temp2;
|
||||
|
||||
for (i = 0; i < (count - 1); ++i)
|
||||
{
|
||||
for (j = 0; j < count - 1 - i; ++j )
|
||||
{
|
||||
intptr_t a = ( intptr_t ) array_get( keys, j );
|
||||
|
||||
intptr_t b = ( intptr_t ) array_get( keys, j+1 );
|
||||
|
||||
if (a > b)
|
||||
{
|
||||
temp = keys->items[j+1];
|
||||
temp2 = tokens->items[j+1];
|
||||
|
||||
keys->items[j+1] = keys->items[j];
|
||||
keys->items[j] = temp;
|
||||
|
||||
tokens->items[j+1] = tokens->items[j];
|
||||
tokens->items[j] = temp2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user