//===-- Lexer.cpp ---------------------------------------------------------===//
//
// The KLEE Symbolic Virtual Machine
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "expr/Lexer.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <iomanip>
#include <string.h>
using namespace llvm;
using namespace klee;
using namespace klee::expr;
///
const char *Token::getKindName() const {
switch (kind) {
default:
case Unknown: return "Unknown";
case Arrow: return "Arrow";
case At: return "At";
case Colon: return "Colon";
case Comma: return "Comma";
case Comment: return "Comment";
case EndOfFile: return "EndOfFile";
case Equals: return "Equals";
case Identifier: return "Identifier";
case KWArray: return "KWArray";
case KWFalse: return "KWFalse";
case KWQuery: return "KWQuery";
case KWReserved: return "KWReserved";
case KWSymbolic: return "KWSymbolic";
case KWTrue: return "KWTrue";
case KWWidth: return "KWWidth";
case LBrace: return "LBrace";
case LParen: return "LParen";
case LSquare: return "LSquare";
case Number: return "Number";
case RBrace: return "RBrace";
case RParen: return "RParen";
case RSquare: return "RSquare";
case Semicolon: return "Semicolon";
}
}
void Token::dump() {
llvm::errs() << "(Token \"" << getKindName() << "\" "
<< (const void*) start << " " << length << " "
<< line << " " << column << ")";
}
///
static inline bool isInternalIdentifierChar(int Char) {
return isalnum(Char) || Char == '_' || Char == '.' || Char == '-';
}
Lexer::Lexer(const llvm::MemoryBuffer *MB)
: BufferPos(MB->getBufferStart()), BufferEnd(MB->getBufferEnd()),
LineNumber(1), ColumnNumber(0) {
}
Lexer::~Lexer() {
}
int Lexer::PeekNextChar() {
if (BufferPos == BufferEnd)
return -1;
return *BufferPos;
}
int Lexer::GetNextChar() {
if (BufferPos == BufferEnd)
return -1;
// Handle DOS/Mac newlines here, by stripping duplicates and by
// returning '\n' for both.
char Result = *BufferPos++;
if (Result == '\n' || Result == '\r') {
if (BufferPos != BufferEnd && *BufferPos == ('\n' + '\r' - Result))
++BufferPos;
Result = '\n';
}
if (Result == '\n') {
++LineNumber;
ColumnNumber = 0;
} else {
++ColumnNumber;
}
return Result;
}
Token &Lexer::SetTokenKind(Token &Result, Token::Kind k) {
Result.kind = k;
Result.length = BufferPos - Result.start;
return Result;
}
static bool isReservedKW(const char *Str, unsigned N) {
unsigned i;
// Check for i[0-9]+
if (N>1 && Str[0] == 'i') {
for (i=1; i<N; ++i)
if (!isdigit(Str[i]))
break;
if (i==N)
return true;
}
// Check for fp[0-9]+([.].*)?$
if (N>3 && Str[0]=='f' && Str[1]=='p' && isdigit(Str[2])) {
for (i=3; i<N; ++i)
if (!isdigit(Str[i]))
break;
if (i==N || Str[i]=='.')
return true;
}
return false;
}
static bool isWidthKW(const char *Str, unsigned N) {
if (N<2 || Str[0] != 'w')
return false;
for (unsigned i=1; i<N; ++i)
if (!isdigit(Str[i]))
return false;
return true;
}
Token &Lexer::SetIdentifierTokenKind(Token &Result) {
unsigned Length = BufferPos - Result.start;
switch (Length) {
case 3:
if (memcmp("def", Result.start, 3) == 0)
return SetTokenKind(Result, Token::KWReserved);
if (memcmp("var", Result.start, 3) == 0)
return SetTokenKind(Result, Token::KWReserved);
break;
case 4:
if (memcmp("true", Result.start, 4) == 0)
return SetTokenKind(Result, Token::KWTrue);
break;
case 5:
if (memcmp("array", Result.start, 5) == 0)
return SetTokenKind(Result, Token::KWArray);
if (memcmp("false", Result.start, 5) == 0)
return SetTokenKind(Result, Token::KWFalse);
if (memcmp("query", Result.start, 5) == 0)
return SetTokenKind(Result, Token::KWQuery);
break;
case 6:
if (memcmp("define", Result.start, 6) == 0)
return SetTokenKind(Result, Token::KWReserved);
break;
case 7:
if (memcmp("declare", Result.start, 7) == 0)
return SetTokenKind(Result, Token::KWReserved);
break;
case 8:
if (memcmp("symbolic", Result.start, 8) == 0)
return SetTokenKind(Result, Token::KWSymbolic);
break;
}
if (isReservedKW(Result.start, Length))
return SetTokenKind(Result, Token::KWReserved);
if (isWidthKW(Result.start, Length))
return SetTokenKind(Result, Token::KWWidth);
return SetTokenKind(Result, Token::Identifier);
}
void Lexer::SkipToEndOfLine() {
for (;;) {
int Char = GetNextChar();
if (Char == -1 || Char =='\n')
break;
}
}
Token &Lexer::LexNumber(Token &Result) {
while (isalnum(PeekNextChar()) || PeekNextChar()=='_')
GetNextChar();
return SetTokenKind(Result, Token::Number);
}
Token &Lexer::LexIdentifier(Token &Result) {
while (isInternalIdentifierChar(PeekNextChar()))
GetNextChar();
// Recognize keywords specially.
return SetIdentifierTokenKind(Result);
}
Token &Lexer::Lex(Token &Result) {
Result.kind = Token::Unknown;
Result.length = 0;
Result.start = BufferPos;
// Skip whitespace.
while (isspace(PeekNextChar()))
GetNextChar();
Result.start = BufferPos;
Result.line = LineNumber;
Result.column = ColumnNumber;
int Char = GetNextChar();
switch (Char) {
case -1: return SetTokenKind(Result, Token::EndOfFile);
case '(': return SetTokenKind(Result, Token::LParen);
case ')': return SetTokenKind(Result, Token::RParen);
case ',': return SetTokenKind(Result, Token::Comma);
case ':': return SetTokenKind(Result, Token::Colon);
case ';': return SetTokenKind(Result, Token::Semicolon);
case '=': return SetTokenKind(Result, Token::Equals);
case '@': return SetTokenKind(Result, Token::At);
case '[': return SetTokenKind(Result, Token::LSquare);
case ']': return SetTokenKind(Result, Token::RSquare);
case '{': return SetTokenKind(Result, Token::LBrace);
case '}': return SetTokenKind(Result, Token::RBrace);
case '#':
SkipToEndOfLine();
return SetTokenKind(Result, Token::Comment);
case '+': {
if (isdigit(PeekNextChar()))
return LexNumber(Result);
else
return SetTokenKind(Result, Token::Unknown);
}
case '-': {
int Next = PeekNextChar();
if (Next == '>')
return GetNextChar(), SetTokenKind(Result, Token::Arrow);
else if (isdigit(Next))
return LexNumber(Result);
else
return SetTokenKind(Result, Token::Unknown);
break;
}
default:
if (isdigit(Char))
return LexNumber(Result);
else if (isalpha(Char) || Char == '_')
return LexIdentifier(Result);
return SetTokenKind(Result, Token::Unknown);
}
}