{ History ------- 20/12/85 - Assign Constant.Kind in MakeStringConst. (agh) } { MODULE 8 Lexical Analysis Lexical analysis is carried out by the procedure NextSymbol. When called, NextSymbol scans the next language symbol in the source stream and returns a representation of it in the following global variables:- Symbol in all cases Symbol represents the symbol scanned, as defined by the type SymbolType. Operator when Symbol = addop, mulop or relop, Operator represents the particular operator scanned, as defined by the type OpType. Spelling when Symbol = ident, the record variable Spelling holds the characters of the identifier scanned, as explained in 8.3. Constant when Symbol = intconst, realconst, charconst or stringconst, Constant holds a representation of the constant scanned. StartofSymbol to facilitate error reporting the position of the initial character of the symbol scanned is left in the global variable StartofSymbol. To initialise the lexical analysis process, and make the first symbol of the source program available, the procedure InitSymbol must be called before any call to NextSymbol. The primary interface presented to the syntax-semantic analyser is thus as follows:- var Symbol : SymbolType ; Operator : Optype ; Spelling : Alfa ; Constant : value ; StartofSymbol : TextPosition ; procedure NextSymbol ; procedure InitSymbol ; } program LexicalAnalyser; #include "globals.x" #include "options.pf" #include "source.pf" #include "interface.pf" { 8.1 Reserved word symbols The lexical analyser distinguishes between identifiers and reserved word symbols by table look-up on the array WordSymbol. Entries in WordSymbol are sorted by word length, with an extra slot at the end of each sequence to allow fast look-up with guaranteed success. The procedure InitWords initialises the WordSymbol array and the auxiliary indexing array LastofLength. } procedure InitWords; visible; const Irrelevant = NotOp; var LastEntered: 0..NoWords; procedure EnterWord (Name: AlfaHead; SValue: SymbolType; OValue: OpType); begin LastEntered := LastEntered + 1; with WordSymbol[LastEntered] do begin Spelling := Name; SymbolValue := SValue; OpValue := OValue end end; begin LastEntered := 0; LastOfLength[0] := LastEntered; EnterWord(' ', Ident, Irrelevant); LastOfLength[1] := LastEntered; EnterWord('IF ', IfSy, Irrelevant); EnterWord('DO ', DoSy, Irrelevant); EnterWord('OF ', OfSy, Irrelevant); EnterWord('TO ', ToSy, Plus); EnterWord('IN ', RelOp, InOp); EnterWord('OR ', AddOp, OrOp); EnterWord(' ', Ident, Irrelevant); LastOfLength[2] := LastEntered; EnterWord('END ', EndSy, Irrelevant); EnterWord('FOR ', ForSy, Irrelevant); EnterWord('VAR ', VarSy, Irrelevant); EnterWord('DIV ', MulOp, IDiv); EnterWord('MOD ', MulOp, IMod); EnterWord('SET ', SetSy, Irrelevant); EnterWord('AND ', MulOp, AndOp); EnterWord('NOT ', NotSy, Irrelevant); EnterWord('NIL ', NilSy, Irrelevant); EnterWord(' ', Ident, Irrelevant); LastOfLength[3] := LastEntered; EnterWord('THEN ', ThenSy, Irrelevant); EnterWord('ELSE ', ElseSy, Irrelevant); EnterWord('WITH ', WithSy, Irrelevant); EnterWord('GOTO ', GoToSy, Irrelevant); EnterWord('CASE ', CaseSy, Irrelevant); EnterWord('TYPE ', TypeSy, Irrelevant); EnterWord('FILE ', FileSy, Irrelevant); EnterWord(' ', Ident, Irrelevant); LastOfLength[4] := LastEntered; EnterWord('BEGIN ', BeginSy, Irrelevant); EnterWord('UNTIL ', UntilSy, Irrelevant); EnterWord('WHILE ', WhileSy, Irrelevant); EnterWord('ARRAY ', ArraySy, Irrelevant); EnterWord('CONST ', ConstSy, Irrelevant); EnterWord('LABEL ', LabelSy, Irrelevant); EnterWord(' ', Ident, Irrelevant); LastOfLength[5] := LastEntered; EnterWord('REPEAT ', RepeatSy, Irrelevant); EnterWord('RECORD ', RecordSy, Irrelevant); EnterWord('DOWNTO ', ToSy, Minus); EnterWord('PACKED ', PackedSy, Irrelevant); if ICLPascal then begin EnterWord('EXTERN ', ExternSy, Irrelevant); EnterWord('PRESET ', PresetSy, Irrelevant); EnterWord('PRAGMA ', PragmaSy, Irrelevant) end; EnterWord(' ', Ident, Irrelevant); LastOfLength[6] := LastEntered; if ICLPascal then EnterWord('VISIBLE ', VisibleSy, Irrelevant); EnterWord('PROGRAM ', ProgramSy, Irrelevant); EnterWord(' ', Ident, Irrelevant); LastOfLength[7] := LastEntered; EnterWord('FUNCTION ', FuncSy, Irrelevant); if ICLPascal then begin EnterWord('READONLY ', ReadOnlySy, Irrelevant); EnterWord('OCCUPIES ', OccupiesSy, Irrelevant) end; EnterWord(' ', Ident, Irrelevant); LastOfLength[8] := LastEntered; EnterWord('PROCEDURE ', ProcSy, Irrelevant); if ICLPascal then EnterWord('OTHERWISE ', OtherwiseSy, Irrelevant); EnterWord(' ', Ident, Irrelevant); LastOfLength[9] := LastEntered; EnterWord(' ', Ident, Irrelevant); LastOfLength[10] := LastEntered; EnterWord(' ', Ident, Irrelevant); LastOfLength[11] := LastEntered; EnterWord(' ', Ident, Irrelevant); LastOfLength[12] := LastEntered end { initwords }; { 8.2 Representation of identifier spellings. The analyser must accept identifiers of any length, treating all characters as significant. It does so by means of the type Alfa, holding up to twelve initial alphanumerics in the field 'head' of of each spelling, and appending the remainder in chained 'chunks' of up to four characters each. All characters of an identifier are thus retained and regarded as significant. The semantic analyser stores and compares the identifier spellings made available in this form. In principle identifier table search- ing requires functions of the form: function OrderedAlfa(a1, a2: alfa): Boolean; function SameAlfa(a1, a2: alfa): Boolean; but because fast table searching is crucial the analyser is assumed to do direct comparisons of alfa heads, using < or >, and to call a function: function OrderedTails(a1, a2: alfa): Boolean; only if the heads are identical. With this optimisation, the introduction of arbitrary length identifiers has no observable effect on compilation speed. To enable recovery of heap storage used by identifier tails a pro- cedure DisposeAlfa is provided, to be called for any alfa that is no longer required. Identifier spellings that are not retained by the analyser must be disposed automatically. To this end the analyser is required to 'copy' each spelling to be retained using the procedure CopySpel- ling. A spelling not copied in this way is disposed automatically by the next call to NextSymbol. In some cases it is necessary for the analyser to reset the global variable Spelling, either to a spelling previously copied or to a synthetic spelling supplied as a string parameter. Procedures Res- toreSpelling and MakeSpelling enable this to be done. The func- tion AlfaLength is used to generate the NameFile for Postmortem generation. } procedure InitAlfa(var AlfaSpelling: Alfa); visible; begin with AlfaSpelling do begin Head := ' '; Copy := Head; Tail := nil end end { InitAlfa }; function RankedTails(Tail1, Tail2: AlfaEntry): Boolean; visible; begin if (Tail1 <> nil) and (Tail2 <> nil) then if Tail1^.Chunk < Tail2^.Chunk then RankedTails := true else if Tail1^.Chunk > Tail2^.Chunk then RankedTails := false else RankedTails := RankedTails(Tail1^.NextChunk, Tail2^.NextChunk) else RankedTails := (Tail2 <> nil) end; function SameAlfa(Name1, Name2: Alfa): Boolean; visible; var Equal: Boolean; Tail1, Tail2: AlfaEntry; begin Equal := false; if Name1.Head = Name2.Head then if (Name1.Tail <> nil) and (Name2.Tail <> nil) then begin Tail1 := Name1.Tail; Tail2 := Name2.Tail; while (Tail1^.Chunk = Tail2^.Chunk) and (Tail1^.NextChunk <> nil) and (Tail2^.NextChunk <> nil) do begin Tail1 := Tail1^.NextChunk; Tail2 := Tail2^.NextChunk end; if (Tail1^.Chunk = Tail2^.Chunk) and (Tail1^.NextChunk = nil) and (Tail2^.NextChunk = nil) then Equal := true end else if (Name1.Tail = nil) and (Name2.Tail = nil) then Equal := true; SameAlfa := Equal end; function EquivAlfa(Name1, Name2: Alfa): Boolean; visible; begin EquivAlfa := Name1.Copy = Name2.Copy end { EquivAlfa }; procedure DispAlfa(LongIdent: Alfa); visible; var ThisEntry, NextEntry: AlfaEntry; begin NextEntry := LongIdent.Tail; while NextEntry <> nil do begin ThisEntry := NextEntry; NextEntry := ThisEntry^.NextChunk; dispose(ThisEntry) end end { dispalfa }; procedure DisposeSpelling; begin DispAlfa(Spelling); Spelling.Tail := nil; TailToBeDisposed := false end { disposespelling }; procedure CopySpelling(var Copy: Alfa); visible; begin Copy := Spelling; TailToBeDisposed := false end { copyspelling }; procedure RestoreSpelling(Copy: Alfa); visible; begin if TailToBeDisposed then DisposeSpelling; Spelling := Copy; TailToBeDisposed := (Spelling.Tail <> nil) end { restospelling }; procedure MakeSpelling(Header: AlfaHead); visible; begin if TailToBeDisposed then DisposeSpelling; Spelling.Head := Header; Spelling.Copy := Header; Spelling.Tail := nil end { makespelling }; { 8.3 The lexical analyser The lexical analyser NextSymbol is a deterministic acceptor which determines its progress from the next available source character, with the following qualifications: 1. The possibility of symbols '..' or '.)' immediately following an integer constant requires 2-character lookahead for their detection. In such cases the global flag PrecedingPeriod is set true to indicate to the subsequent call of NextSymbol that the period has already been scanned; at all other times PrecedingPeriod is false. 2. Reserved word symbols are distinguished from identifiers of similar length by use of the WordSymbol array described in 8.1. 3. The object value corresponding to an integer, real, character or string constant is obtained by collecting the source char- acter sequence involved in a standard form and passing this to the generator procedure Evaluate. 4. If a comment starts with the special option-flag character '$' the procedure SetLocalOption is called to interpret it, as outlined in Module 3. } procedure CopyChar(c: char); begin with Constant do if Length < LineMax then begin Length := Length + 1; String[Length] := c end else begin SystemError(6); Length := 0 end end; procedure NextSymbol; visible; label 9; var k: 0..AlfaSize; l: 1..NoWords; Base, LastLength: integer; StillInteger, Negative: Boolean; LastWasq, StringEnd: Boolean; LastEntry, NewEntry: AlfaEntry; function UpperCase(c: char): char; begin { This version assumes a contiguous collating } { sequence for the lower case letters. } UpperCase := chr(ord(c) - ord('a') + ord('A')) end; procedure CopyCharacter ; var Octal: 0..511; Count: 0..3; begin if ICLPascal and (Ch='\\') then begin NextCh; if Ch in UnixSpecials then case Ch of 'n': begin CopyChar(chr(NewLine)) ; NextCh end; 't' : begin CopyChar(chr(Tab)); NextCh end; 'b' : begin CopyChar(chr(BackSpace)); NextCh end; 'r' : begin CopyChar(chr(CarriageReturn)); NextCh end; 'f' : begin CopyChar(chr(FormFeed)); NextCh end; '\\' : begin CopyChar(chr(BackSlash)); NextCh end; '0', '1', '2', '3', '4', '5', '6', '7': begin Octal := 0 ; Count := 0; while (not LineEnded) and (Count < 3) and (Ch in DigitsForBase[8]) do begin Octal := Octal * 8 + ord(Ch) - ord('0'); Count := Count + 1 ; NextCh end; if Octal > 255 then Error(7, Source^.Position) else CopyChar(chr(Octal)) end end else Error(9, Source^.Position) end else begin CopyChar(Ch); NextCh end end { copycharacter }; procedure DigitSequence(Base: BaseValues); begin if Ch in DigitsForBase[Base] then repeat if Ch in LowerCaseLetters then Ch := UpperCase(Ch); CopyChar(Ch); NextCh until not (Ch in DigitsForBase[Base]) else begin CopyChar('0'); Error(1, Source^.Position) end end; procedure SkipComment; var EndOfComment: Boolean; begin EndOfComment := false; NextCh; repeat while not (Ch in ['}', '*', '{', '(' ]) do NextCh; case Ch of '}' : EndOfComment := true; '*' : begin NextCh; EndOfComment := (Ch = ')'); end; '{' : begin StartOfSymbol := Source^.Position; WarningError(2); NextCh end; '(' : begin StartOfSymbol := Source^.Position; NextCh; if Ch = '*' then WarningError(2) end end until EndOfComment; NextCh end { skipcomment }; procedure SkipSeparators; var Finished: Boolean; begin Finished := false; repeat if Ch <= ' ' then NextCh else if Ch = '{' then SkipComment else if Ch = '(' then begin NextCh; if Ch = '*' then SkipComment else Finished := true end else Finished := true until Finished end { SkipSeparators }; begin { nextsymbol } if TailToBeDisposed then DisposeSpelling; 9 : while Ch <= ' ' do NextCh; StartOfSymbol := Source^.Position; if Ch in SymbolStarters then case Ch of { analysis of special symbols other than word symbols } '+' : begin Symbol := AddOp; Operator := Plus; NextCh end; '-' : begin Symbol := AddOp; Operator := Minus; NextCh; if ICLPascal and (Ch = '>') then begin Symbol := Arrow; NextCh end end; '*' : begin Symbol := MulOp; Operator := Mul; NextCh end; '/' : begin Symbol := MulOp; Operator := RDiv; NextCh end; '=' : begin Symbol := RelOp; Operator := EqOp; NextCh; if ICLPascal and (Ch = '>') then begin Symbol := PresetOp; NextCh end end; '<' : begin NextCh; Symbol := RelOp; if Ch = '=' then begin Operator := LeOp; NextCh end else if Ch = '>' then begin Operator := NeOp; NextCh end else Operator := LtOp end; '>' : begin NextCh; Symbol := RelOp; if Ch = '=' then begin Operator := GeOp; NextCh end else Operator := GtOp end; '(' : begin NextCh; if Ch = '.' then begin Symbol := LeftBracket; NextCh end else if Ch = '*' then begin SkipComment; goto 9 end else Symbol := LeftParent end; ')' : begin if PrecedingPeriod then begin Symbol := RightBracket; PrecedingPeriod := false end else Symbol := RightParent; NextCh end; '[' : begin Symbol := LeftBracket; NextCh end; ']' : begin Symbol := RightBracket; NextCh end; '.' : begin if PrecedingPeriod then PrecedingPeriod := false else NextCh; if Ch = '.' then begin Symbol := Thru; NextCh end else if Ch = ')' then begin Symbol := RightBracket; NextCh end else Symbol := Period end; ',' : begin Symbol := Comma; NextCh end; ':' : begin NextCh; if Ch = '=' then begin Symbol := Becomes; NextCh end else Symbol := Colon end; ';' : begin Symbol := Semicolon; NextCh end; '@', '^' : begin Symbol := Arrow; NextCh end; '{' : begin SkipComment; goto 9 end; '&', '|' : { ICLPascal } begin Symbol := Bar; NextCh end; '?', '$' : { not to be implemented } Symbol := OtherSy; '!' : { ICLPascal } begin Symbol := Shriek; NextCh end; { analysis of word symbols and identifiers } 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' : begin k := 0; Spelling.Head := ' '; Spelling.Copy := Spelling.Head ; repeat k := k + 1; Spelling.Copy[k] := Ch; if Ch in LowerCaseLetters then Spelling.Head[k] := UpperCase(Ch) else Spelling.Head[k] := Ch; NextCh; while (Ch = '_') and ICLPascal do NextCh until (k = AlfaSize) or not (Ch in LettersAndDigits); if Ch in LettersAndDigits then begin LastEntry := nil; repeat k := 0; new(NewEntry); NewEntry^.Chunk := ' '; repeat if Ch in LowerCaseLetters then Ch := UpperCase(Ch); k := k + 1; NewEntry^.Chunk[k] := Ch; NextCh; while (Ch = '_') and ICLPascal do NextCh until (k = ChunkLength) or not (Ch in LettersAndDigits); if LastEntry = nil then Spelling.Tail := NewEntry else LastEntry^.NextChunk := NewEntry; LastEntry := NewEntry until not (Ch in LettersAndDigits); LastEntry^.NextChunk := nil; TailToBeDisposed := true; Symbol := Ident end else begin Spelling.Tail := nil; WordSymbol[LastOfLength[k]].Spelling := Spelling.Head; l := LastOfLength[k - 1] + 1; while WordSymbol[l].Spelling <> Spelling.Head do l := l + 1; with WordSymbol[l] do begin Symbol := SymbolValue; if Symbol in [MulOp, AddOp, RelOp, ToSy] then Operator := OpValue end end end; { analysis of integer and real constants } '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' : begin StillInteger := true; Constant.Length := 0; while Ch = '0' do NextCh; if Ch in ['1'..'9'] then DigitSequence(10) else CopyChar('0'); if ICLPascal and ((Ch = '#') or (Ch = '$')) then begin Constant.Kind := IntValue; Evaluate(Constant); Base := Constant.Velue.IVal; if not (Base in [2..16]) then begin Error(6, Source^.Position); Base := 16; Constant.Velue.Ival := Base; end; NextCh; Constant.Length := 0; DigitSequence(Base); Constant.Kind := BaseValue; Symbol := BasedConst; while Ch in DigitsForBase[16] do NextCh end else begin if Ch = '.' then begin NextCh; { if '.' is followed by a ')' or a second '.' then } { reset flag and leave the '.' or ')' for the next } { call to nextsymbol } if (Ch = ')') or (Ch = '.') then PrecedingPeriod := true else begin CopyChar('.'); StillInteger := false; DigitSequence(10) end end; if (Ch = 'e') or (Ch = 'E') then begin CopyChar('E'); StillInteger := false; NextCh; Negative := (Ch = '-'); if Negative or (Ch = '+') then NextCh; if Negative then CopyChar('-') else CopyChar('+'); DigitSequence(10) end; if StillInteger then begin Symbol := IntConst; Constant.Kind := IntValue end else begin Symbol := RealConst; Constant.Kind := RealValue end end; Evaluate(Constant); if Ch in Letters then Error(4, Source^.Position) end; { analysis of a character string } '''' : begin LastWasq := false; StringEnd := false; Constant.Length := 0; LastLength := 0; NextCh; while not StringEnd do begin repeat if LineEnded then begin Error(2, Source^.Position); StringEnd := true end else if (Ch <> '''') or LastWasq then begin CopyCharacter; LastWasq := false end else begin LastWasq := true; NextCh; StringEnd := Ch <> '''' end until StringEnd; LastWasq := false; { check for null-string } if (Constant.Length = LastLength) then Error(3, Source^.Position); { check for multi-string } if ICLPascal and (Constant.Length > 1) then begin SkipSeparators; if (Ch = '&') or (Ch = '|') then begin NextCh; SkipSeparators; if Ch <> '''' then Error(8, Source^.Position) else begin StringEnd := false; NextCh end; LastLength := Constant.Length end end end; with Constant do if Length <= 1 then begin Kind := CharValue; Symbol := CharConst end else begin Kind := StringValue; Symbol := StringConst end; Evaluate(Constant) end end else begin { deal with other (illegal) symbol } Symbol := OtherSy; NextCh end end { nextsymbol }; procedure MakeStringConst(Name: Alfa); visible; var Index: 1..AlfaSize; Trailer: AlfaEntry; procedure PutCh(c: char); begin if c <> ' ' then CopyChar(c) end; begin Constant.Kind := StringValue; Constant.Length := 0; for Index := 1 to AlfaSize do PutCh(Name.Head[Index]); Trailer := Name.Tail; while Trailer <> nil do begin for Index := 1 to ChunkLength do PutCh(Trailer^.Chunk[Index]); Trailer := Trailer^.NextChunk end; Evaluate(Constant) end { MakeStringConst }; { 8.4 Lexical initialisation The procedure InitSymbol, which is called before any other lexical utility, initialises the reserved word table, the variables Pre- cedingPeriod and TailToBeDisposed, and the 'constant' character sets used by Nextsymbol and then calls Nextsymbol to make the first symbol available to the syntax analyser. } procedure InitSymbol; visible; var b: BaseValues; begin InitWords; PrecedingPeriod := false; TailToBeDisposed := false; Constant.Kind := IntValue; LowerCaseLetters := ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']; Letters := ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] + LowerCaseLetters; Digits := ['0'..'9']; LettersAndDigits := Letters + Digits; SymbolStarters := ['+', '-', '*', '/', '=', '<', '>', '(', ')', '[', ']', '.', ',', ':', ';', '^', '@'] + Letters + Digits + [''''] + ['{']; if ICLPascal then SymbolStarters := SymbolStarters + ['|', '&', '?', '$']; if InHousePascal then SymbolStarters := SymbolStarters + ['!']; DigitsForBase[2] := ['0', '1']; for b := 3 to 10 do DigitsForBase[b] := DigitsForBase[b - 1] + [chr(ord('0') + b - 1)]; for b := 11 to 16 do DigitsForBase[b] := DigitsForBase[b - 1] + [chr(ord('A') + b - 11)] + [chr(ord('a') + b - 11)]; UnixSpecials := [ 'n', 't', 'b', 'r', 'f', '\\' ] + DigitsForBase[8]; NextSymbol end { InitSymbol}; begin { end of module } end.