/*
 * lexer.c -- substring/regex tokenizer that records (offset, token)
 * pairs into parallel key/token arrays.
 */
#include <lexer.h>
#include <replacement.h>

#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
|
|
/* Element count of a true array. Only valid on array objects -- a pointer
   or array-typed parameter would yield sizeof(pointer)/sizeof(element). */
#define ARRAY_SIZE( arr ) ( sizeof( ( arr ) ) / sizeof( ( arr )[0] ) )
|
|
|
|
|
|
|
|
lexer * lexer_new()
|
|
{
|
|
struct lexer * v;
|
|
|
|
v = ( struct lexer * ) malloc( sizeof( struct lexer ) );
|
|
|
|
|
|
v->capacity = ARRAY_INIT_CAPACITY;
|
|
|
|
v->total = 0;
|
|
|
|
v->keys = malloc(sizeof( int ) * v->capacity );
|
|
|
|
v->tokens = malloc(sizeof( char * ) * v->capacity );
|
|
|
|
|
|
return v;
|
|
}
|
|
|
|
/* Number of (key, token) pairs currently stored in the lexer. */
int lexer_length( lexer * v )
{
    return v->total;
}
|
|
|
|
/*
 * Resize the parallel keys/tokens arrays to `capacity` slots.
 *
 * Bug fix: the original passed currentLexer->keys to BOTH realloc
 * calls, so the tokens array was never resized and the keys buffer was
 * reallocated twice -- heap corruption as soon as the initial capacity
 * was exceeded. On realloc failure the old buffers are kept and the
 * recorded capacity is left unchanged.
 */
void lexer_resize( lexer * currentLexer, int capacity )
{
    int * keys = realloc( currentLexer->keys, sizeof( int ) * capacity );
    char * * tokens = realloc( currentLexer->tokens, sizeof( char * ) * capacity );

    /* Adopt whichever reallocations succeeded; never overwrite a live
       pointer with NULL (that would leak the old buffer). */
    if ( keys != NULL ) {
        currentLexer->keys = keys;
    }

    if ( tokens != NULL ) {
        currentLexer->tokens = tokens;
    }

    if ( keys != NULL && tokens != NULL ) {
        currentLexer->capacity = capacity;
    }
}
|
|
|
|
/*
 * Append a (key, token) pair, doubling capacity when the arrays are
 * full. The token pointer is stored as-is -- the lexer neither copies
 * nor owns the string, so it must outlive the lexer.
 */
void lexer_add( lexer * currentLexer, int key, char * token )
{
    int slot;

    if ( currentLexer->total == currentLexer->capacity ) {
        lexer_resize( currentLexer, currentLexer->capacity * 2 );
    }

    slot = currentLexer->total;
    currentLexer->keys[ slot ] = key;
    currentLexer->tokens[ slot ] = token;
    currentLexer->total = slot + 1;
}
|
|
|
|
/* Borrowed pointer to the token stored at `index` (no bounds check). */
char * lexer_getToken( lexer * currentLexer, int index )
{
    return currentLexer->tokens[ index ];
}
|
|
|
|
/* Key (byte offset) stored at `index` (no bounds check). */
int lexer_getKey( lexer * currentLexer, int index )
{
    return currentLexer->keys[ index ];
}
|
|
|
|
/* Set the lexer's cursor position (not consulted by the find/tokenize
   functions in this file -- presumably used by callers elsewhere). */
void lexer_setIndex( lexer * currentLexer, int index )
{
    currentLexer->index = index;
}
|
|
|
|
/*
 * Record every occurrence of `needle` inside `haystack`.
 *
 * Each match is added with its byte offset in `haystack` as the key and
 * the caller's `needle` pointer itself as the token (no copy is made,
 * so `needle` must outlive the lexer). The scan advances one byte past
 * each match start, so overlapping occurrences are all reported.
 *
 * Returns the number of matches recorded.
 */
int lexer_tokenize( lexer * currentLexer, char * haystack, char * needle )
{
    int count = 0;
    char * cursor = haystack;

    /* Parenthesised assignment-in-condition: intentional, and no longer
       trips -Wparentheses as the bare `tmp = strstr(...)` form did. */
    while ( ( cursor = strstr( cursor, needle ) ) != NULL ) {
        lexer_add( currentLexer, ( int )( cursor - haystack ), needle );
        cursor++;
        count++;
    }

    return count;
}
|
|
|
|
/*
 * Walk backwards from array index `fromKey` - 1 and return the KEY of
 * the nearest entry whose token equals `token`, or -1 if none exists.
 */
int lexer_findPreviousKeyByToken( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i ) {
        if ( strcmp( lexer_getToken( currentLexer, i ), token ) == 0 ) {
            return lexer_getKey( currentLexer, i );
        }
    }

    return -1;
}
|
|
|
|
/*
 * Walk backwards from array index `fromKey` - 1 and return the INDEX of
 * the nearest entry whose token equals `token`, or -1 if none exists.
 * (Companion to lexer_findPreviousKeyByToken, which returns the key.)
 */
int lexer_findPreviousTokenIndex( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i )
    {
        /* The unused `key` local from the original has been removed. */
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return i;
        }
    }

    return -1;
}
|
|
|
|
/*
 * Sort the stored pairs into ascending key order, keeping every token
 * paired with its key. Plain bubble sort: adjacent out-of-order pairs
 * are swapped in both arrays simultaneously; equal keys keep their
 * relative order (stable).
 */
void lexer_sortKeys( lexer * currentLexer ) {

    int n = lexer_length( currentLexer );
    int * keys = currentLexer->keys;
    char * * tokens = currentLexer->tokens;

    for ( int pass = 0; pass < n - 1; ++pass )
    {
        for ( int j = 0; j + 1 < n - pass; ++j )
        {
            if ( keys[j] > keys[j + 1] )
            {
                int keySwap = keys[j];
                keys[j] = keys[j + 1];
                keys[j + 1] = keySwap;

                char * tokenSwap = tokens[j];
                tokens[j] = tokens[j + 1];
                tokens[j + 1] = tokenSwap;
            }
        }
    }
}
|
|
|
|
|
|
/*
 * Scan forward from array index `fromKey` and return the KEY of the
 * first entry whose token equals `token`, or -1 if there is none.
 */
int lexer_findNextToken( lexer * currentLexer, int fromKey, char * token ) {

    int count = lexer_length( currentLexer );

    for ( int i = fromKey; i < count; ++i ) {
        if ( strcmp( lexer_getToken( currentLexer, i ), token ) == 0 ) {
            return lexer_getKey( currentLexer, i );
        }
    }

    return -1;
}
|
|
|
|
/*
 * Scan forward from array index `fromKey` and return the INDEX of the
 * first entry whose token equals `token`, or -1 if there is none.
 * (Companion to lexer_findNextToken, which returns the key.)
 */
int lexer_findNextTokenIndex( lexer * currentLexer, int fromKey, char * token ) {

    int count = lexer_length( currentLexer );

    for ( int i = fromKey; i < count; ++i )
    {
        /* The unused `key` local from the original has been removed. */
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return i;
        }
    }

    return -1;
}
|
|
|
|
|
|
// change name to something it does.
|
|
int lexer_findPreviouseTokenIndex( lexer * currentLexer, int fromIndex, struct array * validPreviousTokens ) {
|
|
|
|
//int count = array_length( currentLexer );
|
|
|
|
|
|
for (int i = fromIndex - 1; i >= 0; --i)
|
|
{
|
|
|
|
char * currentToken = lexer_getToken( currentLexer, i );
|
|
|
|
int key = lexer_getKey( currentLexer, i );
|
|
|
|
int validTokenCount = array_length( validPreviousTokens );
|
|
|
|
for (int j = 0; j < validTokenCount; ++j)
|
|
{
|
|
|
|
char * currentValidToken = array_get( validPreviousTokens, j );
|
|
|
|
//printf("find previous token: %s %s \n", currentToken, currentValidToken);
|
|
|
|
if ( strcmp( currentToken, currentValidToken ) == 0 )
|
|
{
|
|
|
|
//printf("token found!!\n\n");
|
|
|
|
return key;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
/*
 * Tokenise `haystack` with PCRE pattern `needle`, adding one
 * (key, token) pair per match. The key is the match offset plus one:
 * the patterns used by this file (e.g. "\sclass\s") consume one leading
 * whitespace character, so +1 points at the word itself.
 *
 * Returns 1 in all cases (compile failure, no match, match error, and
 * success) -- preserved from the original contract.
 *
 * Fixes over the previous version:
 *  - `option_bits` and `utf8` were read without ever being set
 *    (undefined behaviour); they are now obtained via pcre_fullinfo()
 *    and pcre_config(), following the canonical pcredemo program.
 *  - the text_copy() of the subject was leaked; it is freed on every
 *    exit path now.
 *  - the unused `name_table` local was removed.
 */
int lexer_tokenizeRegex( lexer * currentLexer, char * haystack, char * needle, char * token ) {

    const char * error;
    unsigned int option_bits;
    int erroffset;
    int crlf_is_newline;
    int ovector[OVECCOUNT];
    int subject_length;
    int rc;
    int utf8;

    char * subject = text_copy( haystack );
    subject_length = ( int ) strlen( subject );

    pcre * re = pcre_compile( needle,   /* the pattern */
                              0,        /* default options */
                              &error,   /* for error message */
                              &erroffset, /* for error offset */
                              NULL );   /* use default character tables */

    if ( re == NULL )
    {
        printf( "PCRE compilation failed at offset %d: %s\n", erroffset, error );
        free( subject );
        return 1;
    }

    rc = pcre_exec( re,             /* the compiled pattern */
                    NULL,           /* no extra data - pattern not studied */
                    subject,        /* the subject string */
                    subject_length, /* the length of the subject */
                    0,              /* start at offset 0 in the subject */
                    0,              /* default options */
                    ovector,        /* output vector for substring info */
                    OVECCOUNT );    /* number of elements in the vector */

    if ( rc < 0 ) {

        switch ( rc )
        {
            case PCRE_ERROR_NOMATCH:
                /* Nothing to record; not an error for our purposes. */
                break;

            default:
                printf( "Matching error %d\n", rc );
                break;
        }

        pcre_free( re );
        free( subject );
        return 1;
    }

    lexer_add( currentLexer, ovector[0] + 1, token );

    /* Ask the library how the pattern was compiled. Previously these
       values were used uninitialised. */
    ( void ) pcre_fullinfo( re, NULL, PCRE_INFO_OPTIONS, &option_bits );
    utf8 = ( option_bits & PCRE_UTF8 ) != 0;

    /* Reduce to the newline-convention bits; if the pattern set none,
       fall back to the library's build-time default. */
    option_bits &= PCRE_NEWLINE_CR | PCRE_NEWLINE_LF | PCRE_NEWLINE_CRLF |
                   PCRE_NEWLINE_ANY | PCRE_NEWLINE_ANYCRLF;

    if ( option_bits == 0 )
    {
        int d;
        ( void ) pcre_config( PCRE_CONFIG_NEWLINE, &d );
        option_bits = ( d == 13 ) ? PCRE_NEWLINE_CR :
                      ( d == 10 ) ? PCRE_NEWLINE_LF :
                      ( d == ( 13 << 8 | 10 ) ) ? PCRE_NEWLINE_CRLF :
                      ( d == -2 ) ? PCRE_NEWLINE_ANYCRLF :
                      ( d == -1 ) ? PCRE_NEWLINE_ANY : 0;
    }

    /* See if CRLF is a valid newline sequence. */
    crlf_is_newline = option_bits == PCRE_NEWLINE_ANY ||
                      option_bits == PCRE_NEWLINE_CRLF ||
                      option_bits == PCRE_NEWLINE_ANYCRLF;

    /* Loop for second and subsequent matches */
    for (;;)
    {
        int options = 0;               /* Normally no options */
        int start_offset = ovector[1]; /* Start at end of previous match */

        /* If the previous match was for an empty string, we are finished
           if we are at the end of the subject. Otherwise, arrange to run
           another match at the same point to see if a non-empty match can
           be found. */
        if ( ovector[0] == ovector[1] )
        {
            if ( ovector[0] == subject_length ) break;
            options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
        }

        /* Run the next matching operation */
        rc = pcre_exec( re,             /* the compiled pattern */
                        NULL,           /* no extra data */
                        subject,        /* the subject string */
                        subject_length, /* the length of the subject */
                        start_offset,   /* starting offset in the subject */
                        options,        /* options */
                        ovector,        /* output vector */
                        OVECCOUNT );    /* number of elements in the vector */

        if ( rc == PCRE_ERROR_NOMATCH )
        {
            if ( options == 0 )
                break;                      /* All matches found */

            ovector[1] = start_offset + 1;  /* Advance one byte */

            if ( crlf_is_newline &&                      /* If CRLF is newline & */
                 start_offset < subject_length - 1 &&    /* we are at CRLF, */
                 subject[start_offset] == '\r' &&
                 subject[start_offset + 1] == '\n' ) {

                ovector[1] += 1;                         /* Advance by one more. */

            } else if ( utf8 ) {                         /* Otherwise, ensure we */
                                                         /* advance a whole UTF-8 */
                while ( ovector[1] < subject_length )    /* character. */
                {
                    if ( ( subject[ovector[1]] & 0xc0 ) != 0x80 )
                        break;
                    ovector[1] += 1;
                }
            }

            continue;   /* Go round the loop again */
        }

        /* Other matching errors are not recoverable. */
        if ( rc < 0 )
        {
            printf( "Matching error %d\n", rc );
            pcre_free( re );
            free( subject );
            return 1;
        }

        /* Match succeeded: record it like the first one. */
        lexer_add( currentLexer, ovector[0] + 1, token );

        /* Note: rc == 0 would mean ovector was too small, but OVECCOUNT
           is sized by the caller's header; only offset 0 is used here. */
    }

    printf( "\n" );

    pcre_free( re );
    free( subject );

    return 1;
}
|
|
|
|
|
|
/*
 * Populate the lexer with every token occurrence found in `source`.
 *
 * The "class" keyword goes through the regex path so that only
 * whitespace-delimited uses match; every other token is a plain
 * substring scan. The table order reproduces the original call order
 * exactly (note "#include" is scanned before the bare "#").
 */
void lexer_getTokens( lexer * currentLexer, char * source ) {

    static char * plainTokens[] = {
        "{", "}", "(", ")", ";", "<", ">", "=", "->", ".",
        "\"", "#include", "#", "extends", "reflect", "template"
    };

    lexer_tokenizeRegex( currentLexer, source, "\\sclass\\s", "class" );

    for ( size_t i = 0; i < ARRAY_SIZE( plainTokens ); ++i ) {
        lexer_tokenize( currentLexer, source, plainTokens[i] );
    }
}
|
|
|
|
/*
 * Starting at token index `fromKey`, return the index of the "}" that
 * closes the "{" at that position, tracking nesting depth so inner
 * braces are skipped. Returns -1 if the body never closes.
 *
 * Note: if the token at `fromKey` is neither brace, depth is still 0
 * after the first iteration and `fromKey` itself is returned -- callers
 * are expected to pass the index of an opening brace.
 */
int lexer_findBodyCloseIndex( lexer * currentLexer, int fromKey ) {

    int count = lexer_length( currentLexer );
    int depth = 0;

    for ( int i = fromKey; i < count; ++i )
    {
        /* The unused `key` local from the original has been removed. */
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, "{" ) == 0 ) {
            depth++;
        }

        if ( strcmp( currentToken, "}" ) == 0 ) {
            depth--;
        }

        if ( depth == 0 ) {
            return i;
        }
    }

    return -1;
}
|
|
|
|
/*
 * Walk backwards from array index `fromKey` - 1 and return the KEY of
 * the nearest entry whose token equals `token`, or -1 if none exists.
 *
 * Fix: the loop condition was `i > 0`, which silently skipped entry 0;
 * the otherwise-identical lexer_findPreviousKeyByToken uses `i >= 0`,
 * so the same bound is applied here for consistency.
 */
int lexer_findPreviousToken( lexer * currentLexer, int fromKey, char * token ) {

    for ( int i = fromKey - 1; i >= 0; --i )
    {
        char * currentToken = lexer_getToken( currentLexer, i );

        if ( strcmp( currentToken, token ) == 0 ) {
            return lexer_getKey( currentLexer, i );
        }
    }

    return -1;
}
|
|
|
|
void lexer_sortKeys_separate( struct array * keys, struct array * tokens ) {
|
|
|
|
int count = array_length( keys );
|
|
|
|
int i, j;
|
|
|
|
void * temp;
|
|
|
|
void * temp2;
|
|
|
|
for (i = 0; i < (count - 1); ++i)
|
|
{
|
|
for (j = 0; j < count - 1 - i; ++j )
|
|
{
|
|
intptr_t a = ( intptr_t ) array_get( keys, j );
|
|
|
|
intptr_t b = ( intptr_t ) array_get( keys, j+1 );
|
|
|
|
if (a > b)
|
|
{
|
|
temp = keys->items[j+1];
|
|
temp2 = tokens->items[j+1];
|
|
|
|
keys->items[j+1] = keys->items[j];
|
|
keys->items[j] = temp;
|
|
|
|
tokens->items[j+1] = tokens->items[j];
|
|
tokens->items[j] = temp2;
|
|
}
|
|
}
|
|
}
|
|
|
|
} |