Скачиваний:
10
Добавлен:
01.05.2014
Размер:
16.65 Кб
Скачать
{
C-like language compiler v0.1.0.485

12 november 2007

Copyright (C) 2006, 2007 Igor Krooshch
rheo.ikro@gmail.com

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
}

unit UCcTokenizer;

interface

uses
SysUtils, Classes,
UCcBuffer;

type
TCcTokenSubType = (
tstNone, tstEof,
tstIdentifier,
// константы
tstIntConst, tstFloatConst, tstStrConst, tstCharConst,
// ключевые слова
tstIf, tstElse, tstWhile, tstDo, tstFor, tstBreak, tstContinue,
tstInt, tstFloat, tstChar, tstVoid, tstStruct, tstEnum, tstTypedef,
tstConst, tstDefault, tstReturn,

// операции
// assignment-operators
tstAssignment, tstMulAssign, tstDivAssign, tstRemAssign, tstPlusAssign,
tstMinusAssign, tstShlAssign, tstShrAssign, tstAndAssign, tstXorAssign,
tstOrAssign,

tstLess, tstGreater, tstEqual,
tstLessEqual, tstGreatEqual, tstNotEqual,
tstAnd, tstOr, tstXor, tstBitAnd, tstBitOr, tstNot, tstAddition, tstShl,
tstShr,
tstPlus, tstMinus, tstDiv, tstMul, tstRemainder, tstInc,
tstDec,

// разделители
tstOpenParent, tstCloseParent, tstOpenSqParent, tstCloseSqParent,
tstOpenBraces, tstCloseBraces, tstSemicolon, tstDot, tstComma, tstQuestion,
tstColon
);

TCcTokenSubTypes = set of TCcTokenSubType;

TCcTokenKeywords = tstIf..tstReturn;

TCcTokenOperators = tstAssignment..tstDec;
TCcTokenAssignmentOperators = tstAssignment..tstOrAssign;
TCcTokenCompareOperators = tstLess..tstNotEqual;
TCcTokenBinaryOperators = tstAnd..tstRemainder;

TCcTokenDelimiters = tstOpenParent..tstColon;

TCcTokenTypeSpecifiers = tstInt..tstEnum;
TCcTokenTypeQualifier = tstConst..tstConst;

TCcTokenValue = record
AsString: string;
case integer of
1: (AsInteger: integer);
2: (AsDouble: double);
3: (AsBoolean: boolean);
4: (AsByte: byte);
5: (AsChar: char);
end;

ECcTokenizerException = class(ECcBufferException);


TCcToken = class
private
FSubType: TCcTokenSubType;
FValue: TCcTokenValue;
FText: string;
FPosition: TCcPosition;
protected
procedure UpdateInfo(
ASubType: TCcTokenSubType; const AText: string; AValue: TCcTokenValue);
public
constructor Create;
procedure Assign(AToken: TObject);

property SubType: TCcTokenSubType read FSubType write FSubType;
property Text: string read FText write FText;
property Value: TCcTokenValue read FValue write FValue;
end;


TCcTokenizer = class
private
FBuffer: TCcBuffer;
FStream: TFileStream;

FDebugProgress: boolean;

FUngot: boolean;

FToken: TCcToken;

function IdentifyKeyword: boolean;

function ReadChar(out AEsc: boolean): char; overload;
function ReadChar: char; overload;

procedure ReadIdentifier;
procedure ReadIntOrFloatConst(const AEscSeq: boolean = false);
procedure ReadStrConst;
procedure ReadSymbol;

procedure SkipWhitespaces;
function GetPosition: TCcPosition;
public
constructor Create(
const AFileName: string; const ADebugProgress: boolean = false);
destructor Destroy; override;

procedure ErrorFmt(const AMsg: string; const AArgs: array of const);
procedure Error(const AMsg: string);

function GetToken: TCcToken;
function NextToken(const ANeedsNext: boolean = false): TCcToken;

function LookAhead: TCcToken;

procedure UngetToken;

procedure RequireToken(
ATokenType: TCcTokenSubType; const AStr: string = '');

property Position: TCcPosition read GetPosition;
end;


const
KW_VALUES: array[TCcTokenKeywords] of string = (
'if', 'else', 'while', 'do', 'for', 'break', 'continue',
'int', 'float', 'char', 'void', 'struct', 'enum', 'typedef',
'const', 'default', 'return'
);

OPER_VALUES: array[TCcTokenOperators] of string = (
'=', '*=', '/=', '%=', '+=', '-=', '<<=', '>>=', '&=', '^=', '|=',
'<', '>', '==', '<=', '>=', '!=', '&', '|', '^', '&&', '||', '!', '~',
'<<', '>>', '+', '-', '/', '*', '%', '++', '--'
);

DELIM_VALUES: array[TCcTokenDelimiters] of string = (
'(', ')', '[', ']', '{', '}', ';', '.', ',', '?', ':'
);


implementation

uses
UCcTokenizerConsts;

function IsDigit(A: char): boolean;
begin
Result := A in ['0'..'9'];
end;


function IsOctDigit(A: char): boolean;
begin
Result := A in ['0'..'7'];
end;


function IsHexDigit(A: char): boolean;
begin
Result := A in ['0'..'9', 'a'..'f', 'A'..'F'];
end;


function IsOpInList(const S: string): TCcTokenSubType;
var
t: TCcTokenSubType;
begin
Result := tstNone;
for t := Low(TCcTokenOperators) to High(TCcTokenOperators) do
if OPER_VALUES[t] = S then begin
Result := t;
Exit;
end;
end;


function IsNonDigit(A: char): boolean;
begin
Result := A in ['_', 'A'..'Z', 'a'..'z'];
end;


function OctToInt(const A: string): integer;
var
i: integer;
o: byte;
begin
o := Ord('0');
Result := 0;
for i := 1 to Length(A) do
Result := Result * 8 + Ord(A[i]) - o;
end;


{ TCcTokenizer }

constructor TCcTokenizer.Create(
const AFileName: string; const ADebugProgress: boolean);
begin
FStream := TFileStream.Create(AFileName, fmOpenRead);
FBuffer := TCcBuffer.Create(FStream);
FUngot := false;
FDebugProgress := ADebugProgress;

FToken := TCcToken.Create;
end;


destructor TCcTokenizer.Destroy;
begin
FToken.Free;
FBuffer.Free;
FStream.Free;
inherited;
end;


procedure TCcTokenizer.Error(const AMsg: string);
begin
FBuffer.Error(AMsg, ECcTokenizerException);
end;


procedure TCcTokenizer.ErrorFmt(
const AMsg: string; const AArgs: array of const);
begin
Error(Format(AMsg, AArgs));
end;


function TCcTokenizer.GetPosition: TCcPosition;
begin
Result := FBuffer.Position;
end;


function TCcTokenizer.GetToken: TCcToken;
begin
if FDebugProgress then
write('.');
if (FToken.SubType = tstNone) then
Result := NextToken
else
Result := FToken;
end;


function TCcTokenizer.IdentifyKeyword: boolean;
var
i: TCcTokenSubType;
begin
Result := false;
for i := Low(TCcTokenKeywords) to High(TCcTokenKeywords) do
if CompareStr(KW_VALUES[i], FToken.Text) = 0 then begin
FToken.SubType := i;
Result := true;
Break;
end;
end;


function TCcTokenizer.LookAhead: TCcToken;
begin
Result := NextToken;
UngetToken;
end;


function TCcTokenizer.NextToken(const ANeedsNext: boolean): TCcToken;
var
c: char;
begin
if FDebugProgress then
write('-');

if FUngot and not ANeedsNext then begin
FUngot := false;
Result := GetToken;
end
else begin
if ANeedsNext then
FUngot := false;
SkipWhitespaces;
c := FBuffer.GetChar;
FBuffer.UngetChar;
if (c = #0) then begin
FToken.SubType := tstEof;
FToken.Text := '';
end
else if isNonDigit(c) then
ReadIdentifier
else if isDigit(c) or (c = '.') then
if
(c = '.') and
(
(FToken.SubType = tstIdentifier) or
(FToken.SubType = tstCloseSqParent)
)
then begin
FToken.SubType := tstDot;
FToken.Text := c;
FBuffer.GetChar;
end
else
ReadIntOrFloatConst
else if c = '"' then
ReadStrConst
else if c = '''' then begin
FBuffer.GetChar;
c := ReadChar;
FToken.SubType := tstIntConst;
FToken.Text := c;
with FToken.Value do
AsInteger := Ord(c);
c := FBuffer.GetChar;
if (c <> '''') then
Error(SCcTokenUnexpectedSymbolError);
end
else
ReadSymbol;
Result := FToken;
end;
end;


function TCcTokenizer.ReadChar(out AEsc: boolean): char;
var
c: char;
begin
Result := #0;
c := FBuffer.GetChar;
AEsc := c = '\';
if (c = '\') then begin
c := FBuffer.GetChar;
if (c in ['x', 'X']) or IsOctDigit(c) then begin
FBuffer.UngetChar;
ReadIntOrFloatConst(true);
if not (FToken.Value.AsInteger in [0..255]) then
Error(SCcTokenCharCodeOutOfRange);
Result := FToken.Value.AsChar;
end
else
case c of
#0: Error(SCcTokenUnexpectedEOFError);
'"', '''', '\': Result := c;
'a', 'A': Result := #7;
'b', 'B': Result := #8;
't', 'T': Result := #9;
'n', 'N': Result := #$A;
'v', 'V': Result := #$B;
'r', 'R': Result := #$D;
'f', 'F': Result := #$C;
#13:
begin
SkipWhitespaces;
Result := FBuffer.GetChar;
if Result = #0 then
Error(SCcTokenUnexpectedEOFError);
end;
else
Result := c;
end;
end
else begin
if c in [#13, #10] then
Error(SCcTokenUnexpectedEndOfLine)
else
Result := c;
end;
end;


function TCcTokenizer.ReadChar: char;
var
b: boolean;
begin
Result := ReadChar(b);
end;


procedure TCcTokenizer.ReadIdentifier;
var
c: char;
begin
c := FBuffer.GetChar;
FToken.Text := '';
if not isNonDigit(c) then
Error(SCcTokenUnexpectedSymbolInIdentifier);
while (isNonDigit(c) or isDigit(c)) do begin
FToken.Text := FToken.Text + c;
c := FBuffer.GetChar;
end;
FBuffer.UngetChar;
if not IdentifyKeyword then
FToken.SubType := tstIdentifier;
end;


procedure TCcTokenizer.ReadIntOrFloatConst(const AEscSeq: boolean = false);
var
c: char;
iv: integer;
hasValue: boolean;

procedure readFloatConst;
var
s: string;
ds, c: char;
hasFloatValue: boolean;
begin
hasFloatValue := false;
ds := DecimalSeparator;
try
DecimalSeparator := '.';
s := IntToStr(iv) + '.';
c := FBuffer.GetChar;
if (c = '.') then
c := FBuffer.GetChar;
while isDigit(c) do begin
s := s + c;
c := FBuffer.GetChar;
hasFloatValue := true;
end;
if (c in ['e', 'E']) then begin
s := s + c;
c := FBuffer.GetChar;
if (c in ['+', '-']) then begin
s := s + c;
hasFloatValue := true;
c := FBuffer.GetChar;
end;
if not isDigit(c) then
Error(SCcTokenUnexpectedEndOfFloatExpConst);
while isDigit(c) do begin
s := s + c;
hasFloatValue := true;
c := FBuffer.GetChar;
end;
end;
if not (hasFloatValue or hasValue) then
Error(SCcTokenInFloatRepError);
FBuffer.UngetChar;
FToken.Text := s;
with FToken.Value do
AsDouble := StrToFloat(s);
FToken.SubType := tstFloatConst;
finally
DecimalSeparator := ds;
end;
end;


procedure readIntConst(const AEsc: boolean = false);
var
s: string;
begin
s := '';
if AEsc then
s := '0';
iv := 0;
if (c = '0') or AEsc then begin
while isOctDigit(c) do begin
s := s + c;
c := FBuffer.GetChar;
end;
if (c in ['x', 'X']) and (AEsc or (s = '0')) then begin
s := s + c;
c := FBuffer.GetChar;
while isHexDigit(c) do begin
s := s + c;
c := FBuffer.GetChar;
end;
if AEsc and (LowerCase(s) = '0x') then
Error(SCcTokenHexValueIsTooShort);

try
iv := StrToInt('$' + Copy(s, 3, Length(s) - 2));
except
Error(Format(SCcTokenInvalidIntValFmt, [s]));
end;
FBuffer.UngetChar;
end
else if (c in ['.', 'e', 'E']) then begin
if AEsc then
Error(SCcTokenFloatIsNotAllowed);
if (s <> '0') then
Error(SCcTokenIntegerPartError);
hasValue := s <> '';
FBuffer.UngetChar;
readFloatConst;
Exit;
end
else begin
iv := OctToInt(s);
FBuffer.UngetChar;
end;
end
else begin
while isDigit(c) do begin
s := s + c;
c := FBuffer.GetChar;
end;
iv := StrToInt(s);
if (c in ['.', 'e', 'E']) then begin
hasValue := s <> '';
FBuffer.UngetChar;
readFloatConst;
Exit;
end
else
FBuffer.UngetChar;
end;
FToken.Text := s;
with FToken.Value do
AsInteger := iv;
FToken.SubType := tstIntConst;
end;


begin
iv := 0;
hasValue := false;
c := FBuffer.GetChar;
if (c = '.') then
readFloatConst
else if AEscSeq or isDigit(c) then
readIntConst(AEscSeq)
else
Error(SCcTokenUndefinedTypeOfConst);
end;


procedure TCcTokenizer.ReadStrConst;
var
c: char;
s: string;
b: boolean;
begin
c := FBuffer.GetChar;
if c <> '"' then
Error(SCcTokenUnexpectedSymbolError);
s := '';
repeat
c := ReadChar(b);
if b or (c <> '"') then
s := s + c;
until (c = '"') and not b;
FToken.Text := s;
FToken.SubType := tstStrConst;
end;


procedure TCcTokenizer.ReadSymbol;
var
c: char;
s: string;
t, lt: TCcTokenSubType;
begin
c := FBuffer.GetChar;
for t := Low(TCcTokenDelimiters) to High(TCcTokenDelimiters) do
if DELIM_VALUES[t][1] = c then begin
FToken.SubType := t;
FToken.Text := c;
Exit;
end;
s := c;
t := IsOpInList(s);
lt := t;
while (c <> #0) and (t <> tstNone) do begin
c := FBuffer.GetChar;
s := s + c;
lt := t;
t := IsOpInList(s);
end;
Delete(s, Length(s), 1);
if (c <> #0) then
FBuffer.UngetChar;
if (s <> '') then begin
FToken.SubType := lt;
FToken.Text := s;
end
else
Error(SCcTokenUnexpectedSymbolError);
end;


procedure TCcTokenizer.RequireToken(
ATokenType: TCcTokenSubType; const AStr: string);
begin
if NextToken.SubType <> ATokenType then
ErrorFmt(SCcTokenExpectedButFoundFmt, [AStr, GetToken.Text]);
end;


procedure TCcTokenizer.SkipWhitespaces;

procedure skipComment1;
var
c: char;
begin
with FBuffer do
while true do begin
c := GetChar;
CheckEof(c);
if (c = '*') then begin
c := GetChar;
CheckEof(c);
if (c = '/') then
Break;
end;
end;
end;


procedure skipComment2;
var
c: char;
begin
with FBuffer do
while true do begin
c := GetChar;
if (c = #0) then begin
UngetChar;
Exit;
end;
if (c = #13) then begin
UngetChar;
Break;
end;
end;
end;

var
c: char;
begin
with FBuffer do begin
while (GetChar in [#32, #13, #10, #9]) do;
UngetChar;
if (GetChar = '/') then begin
c := GetChar;
if (c = '*') then begin
skipComment1;
SkipWhitespaces;
end
else if (c = '/') then begin
skipComment2;
SkipWhitespaces;
end
else begin
UngetChar;
UngetChar;
end;
end
else
UngetChar;
end;
end;


procedure TCcTokenizer.UngetToken;
begin
FUngot := true;
end;


{ TCcToken }

procedure TCcToken.Assign(AToken: TObject);
begin
if (AToken is TCcToken) then
with AToken as TCcToken do begin
Self.FSubType := FSubType;
Self.FValue := FValue;
Self.FText := FText;
Self.FPosition := FPosition;
end;
end;


constructor TCcToken.Create;
begin
FSubType := tstNone;
end;


procedure TCcToken.UpdateInfo(
ASubType: TCcTokenSubType; const AText: string; AValue: TCcTokenValue);
begin
FSubType := ASubType;
FText := AText;
FValue := AValue;
end;


end.
Соседние файлы в папке Tokenizer