ddmd.lexer source code

1 // Compiler implementation of the D programming language
2 // Copyright (c) 1999-2015 by Digital Mars
3 // All Rights Reserved
4 // written by Walter Bright
5 // http://www.digitalmars.com
6 // Distributed under the Boost Software License, Version 1.0.
7 // http://www.boost.org/LICENSE_1_0.txt
8 
9 module ddmd.lexer;
10 
11 import core.stdc.ctype;
12 import core.stdc.errno;
13 import core.stdc.stdarg;
14 import core.stdc.stdio;
15 import core.stdc.string;
16 import core.stdc.time;
17 
18 import ddmd.entity;
19 import ddmd.errors;
20 import ddmd.globals;
21 import ddmd.id;
22 import ddmd.identifier;
23 import ddmd.root.longdouble;
24 import ddmd.root.outbuffer;
25 import ddmd.root.port;
26 import ddmd.root.rmem;
27 import ddmd.root.stringtable;
28 import ddmd.tokens;
29 import ddmd.utf;
30 
31 enum LS = 0x2028;       // UTF line separator
32 enum PS = 0x2029;       // UTF paragraph separator
33 
34 /********************************************
35  * Do our own char maps
36  */
37 immutable ubyte[256] cmtable;
38 enum CMoctal  = 0x1;
39 enum CMhex    = 0x2;
40 enum CMidchar = 0x4;
41 enum CMzerosecond = 0x8;
42 enum CMdigitsecond = 0x10;
43 enum CMsinglechar = 0x20;
44 
45 bool isoctal(char c)
46 {
47     return (cmtable[c] & CMoctal) != 0;
48 }
49 
50 bool ishex(char c)
51 {
52     return (cmtable[c] & CMhex) != 0;
53 }
54 
55 bool isidchar(char c)
56 {
57     return (cmtable[c] & CMidchar) != 0;
58 }
59 
60 bool isZeroSecond(char c)
61 {
62     return (cmtable[c] & CMzerosecond) != 0;
63 }
64 
65 bool isDigitSecond(char c)
66 {
67     return (cmtable[c] & CMdigitsecond) != 0;
68 }
69 
70 bool issinglechar(char c)
71 {
72     return (cmtable[c] & CMsinglechar) != 0;
73 }
74 
75 static this()
76 {
77     foreach (const c; 0 .. cmtable.length)
78     {
79         if ('0' <= c && c <= '7')
80             cmtable[c] |= CMoctal;
81         if (isxdigit(c))
82             cmtable[c] |= CMhex;
83         if (isalnum(c) || c == '_')
84             cmtable[c] |= CMidchar;
85 
86         switch (c)
87         {
88             case 'x': case 'X':
89             case 'b': case 'B':
90                 cmtable[c] |= CMzerosecond;
91                 break;
92 
93             case '0': .. case '9':
94             case 'e': case 'E':
95             case 'f': case 'F':
96             case 'l': case 'L':
97             case 'p': case 'P':
98             case 'u': case 'U':
99             case 'i':
100             case '.':
101             case '_':
102                 cmtable[c] |= CMzerosecond | CMdigitsecond;
103                 break;
104 
105             default:
106                 break;
107         }
108 
109         switch (c)
110         {
111             case '\\':
112             case '\n':
113             case '\r':
114             case 0:
115             case 0x1A:
116             case '\'':
117                 break;
118             default:
119                 if (!(c & 0x80))
120                     cmtable[c] |= CMsinglechar;
121                 break;
122         }
123     }
124 }
125 
126 unittest
127 {
128     //printf("lexer.unittest\n");
129     /* Not much here, just trying things out.
130      */
131     string text = "int";
132     scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0);
133     TOK tok;
134     tok = lex1.nextToken();
135     //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOKint32);
136     assert(tok == TOKint32);
137     tok = lex1.nextToken();
138     assert(tok == TOKeof);
139     tok = lex1.nextToken();
140     assert(tok == TOKeof);
141 }
142 
143 /***********************************************************
144  */
145 class Lexer
146 {
147 public:
148     __gshared OutBuffer stringbuffer;
149 
150     Loc scanloc;            // for error messages
151 
152     const(char)* base;      // pointer to start of buffer
153     const(char)* end;       // past end of buffer
154     const(char)* p;         // current character
155     const(char)* line;      // start of current line
156     Token token;
157     bool doDocComment;      // collect doc comment information
158     bool anyToken;          // seen at least one token
159     bool commentToken;      // comments are TOKcomment's
160     bool errors;            // errors occurred during lexing or parsing
161 
162     /*********************
163      * Creates a Lexer.
164      * Params:
165      *  filename = used for error messages
166      *  base = source code, ending in a 0 byte
167      *  begoffset = starting offset into base[]
168      *  endoffset = last offset into base[]
169      *  doDocComment = handle documentation comments
170      *  commentToken = comments become TOKcomment's
171      */
172     this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset, bool doDocComment, bool commentToken)
173     {
174         scanloc = Loc(filename, 1, 1);
175         //printf("Lexer::Lexer(%p,%d)\n",base,length);
176         //printf("lexer.filename = %s\n", filename);
177         token = Token.init;
178         this.base = base;
179         this.end = base + endoffset;
180         p = base + begoffset;
181         line = p;
182         this.doDocComment = doDocComment;
183         this.commentToken = commentToken;
184         //initKeywords();
185         /* If first line starts with '#!', ignore the line
186          */
187         if (p[0] == '#' && p[1] == '!')
188         {
189             p += 2;
190             while (1)
191             {
192                 char c = *p;
193                 switch (c)
194                 {
195                 case '\n':
196                     p++;
197                     break;
198                 case '\r':
199                     p++;
200                     if (*p == '\n')
201                         p++;
202                     break;
203                 case 0:
204                 case 0x1A:
205                     break;
206                 default:
207                     if (c & 0x80)
208                     {
209                         uint u = decodeUTF();
210                         if (u == PS || u == LS)
211                             break;
212                     }
213                     p++;
214                     continue;
215                 }
216                 break;
217             }
218             endOfLine();
219         }
220     }
221 
222     final TOK nextToken()
223     {
224         if (token.next)
225         {
226             Token* t = token.next;
227             memcpy(&token, t, Token.sizeof);
228             t.free();
229         }
230         else
231         {
232             scan(&token);
233         }
234         //token.print();
235         return token.value;
236     }
237 
238     /***********************
239      * Look ahead at next token's value.
240      */
241     final TOK peekNext()
242     {
243         return peek(&token).value;
244     }
245 
246     /***********************
247      * Look 2 tokens ahead at value.
248      */
249     final TOK peekNext2()
250     {
251         Token* t = peek(&token);
252         return peek(t).value;
253     }
254 
255     /****************************
256      * Turn next token in buffer into a token.
257      */
258     final void scan(Token* t)
259     {
260         const lastLine = scanloc.linnum;
261         Loc startLoc;
262         t.blockComment = null;
263         t.lineComment = null;
264         while (1)
265         {
266             t.ptr = p;
267             //printf("p = %p, *p = '%c'\n",p,*p);
268             t.loc = loc();
269             switch (*p)
270             {
271             case 0:
272             case 0x1A:
273                 t.value = TOKeof; // end of file
274                 return;
275             case ' ':
276             case '\t':
277             case '\v':
278             case '\f':
279                 p++;
280                 continue;
281                 // skip white space
282             case '\r':
283                 p++;
284                 if (*p != '\n') // if CR stands by itself
285                     endOfLine();
286                 continue;
287                 // skip white space
288             case '\n':
289                 p++;
290                 endOfLine();
291                 continue;
292                 // skip white space
293             case '0':
294                 if (!isZeroSecond(p[1]))        // if numeric literal does not continue
295                 {
296                     ++p;
297                     t.uns64value = 0;
298                     t.value = TOKint32v;
299                     return;
300                 }
301                 goto Lnumber;
302 
303             case '1': .. case '9':
304                 if (!isDigitSecond(p[1]))       // if numeric literal does not continue
305                 {
306                     t.uns64value = *p - '0';
307                     ++p;
308                     t.value = TOKint32v;
309                     return;
310                 }
311             Lnumber:
312                 t.value = number(t);
313                 return;
314 
315             case '\'':
316                 if (issinglechar(p[1]) && p[2] == '\'')
317                 {
318                     t.uns64value = p[1];        // simple one character literal
319                     t.value = TOKcharv;
320                     p += 3;
321                 }
322                 else
323                     t.value = charConstant(t);
324                 return;
325             case 'r':
326                 if (p[1] != '"')
327                     goto case_ident;
328                 p++;
329                 goto case '`';
330             case '`':
331                 t.value = wysiwygStringConstant(t, *p);
332                 return;
333             case 'x':
334                 if (p[1] != '"')
335                     goto case_ident;
336                 p++;
337                 t.value = hexStringConstant(t);
338                 return;
339             case 'q':
340                 if (p[1] == '"')
341                 {
342                     p++;
343                     t.value = delimitedStringConstant(t);
344                     return;
345                 }
346                 else if (p[1] == '{')
347                 {
348                     p++;
349                     t.value = tokenStringConstant(t);
350                     return;
351                 }
352                 else
353                     goto case_ident;
354             case '"':
355                 t.value = escapeStringConstant(t, 0);
356                 return;
357             case 'a':
358             case 'b':
359             case 'c':
360             case 'd':
361             case 'e':
362             case 'f':
363             case 'g':
364             case 'h':
365             case 'i':
366             case 'j':
367             case 'k':
368             case 'l':
369             case 'm':
370             case 'n':
371             case 'o':
372             case 'p':
373                 /*case 'q': case 'r':*/
374             case 's':
375             case 't':
376             case 'u':
377             case 'v':
378             case 'w':
379                 /*case 'x':*/
380             case 'y':
381             case 'z':
382             case 'A':
383             case 'B':
384             case 'C':
385             case 'D':
386             case 'E':
387             case 'F':
388             case 'G':
389             case 'H':
390             case 'I':
391             case 'J':
392             case 'K':
393             case 'L':
394             case 'M':
395             case 'N':
396             case 'O':
397             case 'P':
398             case 'Q':
399             case 'R':
400             case 'S':
401             case 'T':
402             case 'U':
403             case 'V':
404             case 'W':
405             case 'X':
406             case 'Y':
407             case 'Z':
408             case '_':
409             case_ident:
410                 {
411                     while (1)
412                     {
413                         const c = *++p;
414                         if (isidchar(c))
415                             continue;
416                         else if (c & 0x80)
417                         {
418                             const s = p;
419                             const u = decodeUTF();
420                             if (isUniAlpha(u))
421                                 continue;
422                             error("char 0x%04x not allowed in identifier", u);
423                             p = s;
424                         }
425                         break;
426                     }
427                     Identifier id = Identifier.idPool(cast(char*)t.ptr, p - t.ptr);
428                     t.ident = id;
429                     t.value = cast(TOK)id.value;
430                     anyToken = 1;
431                     if (*t.ptr == '_') // if special identifier token
432                     {
433                         __gshared bool initdone = false;
434                         __gshared char[11 + 1] date;
435                         __gshared char[8 + 1] time;
436                         __gshared char[24 + 1] timestamp;
437                         if (!initdone) // lazy evaluation
438                         {
439                             initdone = true;
440                             time_t ct;
441                             .time(&ct);
442                             const p = ctime(&ct);
443                             assert(p);
444                             sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
445                             sprintf(&time[0], "%.8s", p + 11);
446                             sprintf(&timestamp[0], "%.24s", p);
447                         }
448                         if (id == Id.DATE)
449                         {
450                             t.ustring = date.ptr;
451                             goto Lstr;
452                         }
453                         else if (id == Id.TIME)
454                         {
455                             t.ustring = time.ptr;
456                             goto Lstr;
457                         }
458                         else if (id == Id.VENDOR)
459                         {
460                             t.ustring = global.compiler.vendor;
461                             goto Lstr;
462                         }
463                         else if (id == Id.TIMESTAMP)
464                         {
465                             t.ustring = timestamp.ptr;
466                         Lstr:
467                             t.value = TOKstring;
468                             t.postfix = 0;
469                             t.len = cast(uint)strlen(t.ustring);
470                         }
471                         else if (id == Id.VERSIONX)
472                         {
473                             uint major = 0;
474                             uint minor = 0;
475                             bool point = false;
476                             for (const(char)* p = global._version + 1; 1; p++)
477                             {
478                                 const c = *p;
479                                 if (isdigit(cast(char)c))
480                                     minor = minor * 10 + c - '0';
481                                 else if (c == '.')
482                                 {
483                                     if (point)
484                                         break;
485                                     // ignore everything after second '.'
486                                     point = true;
487                                     major = minor;
488                                     minor = 0;
489                                 }
490                                 else
491                                     break;
492                             }
493                             t.value = TOKint64v;
494                             t.uns64value = major * 1000 + minor;
495                         }
496                         else if (id == Id.EOFX)
497                         {
498                             t.value = TOKeof;
499                             // Advance scanner to end of file
500                             while (!(*p == 0 || *p == 0x1A))
501                                 p++;
502                         }
503                     }
504                     //printf("t->value = %d\n",t->value);
505                     return;
506                 }
507             case '/':
508                 p++;
509                 switch (*p)
510                 {
511                 case '=':
512                     p++;
513                     t.value = TOKdivass;
514                     return;
515                 case '*':
516                     p++;
517                     startLoc = loc();
518                     while (1)
519                     {
520                         while (1)
521                         {
522                             const c = *p;
523                             switch (c)
524                             {
525                             case '/':
526                                 break;
527                             case '\n':
528                                 endOfLine();
529                                 p++;
530                                 continue;
531                             case '\r':
532                                 p++;
533                                 if (*p != '\n')
534                                     endOfLine();
535                                 continue;
536                             case 0:
537                             case 0x1A:
538                                 error("unterminated /* */ comment");
539                                 p = end;
540                                 t.loc = loc();
541                                 t.value = TOKeof;
542                                 return;
543                             default:
544                                 if (c & 0x80)
545                                 {
546                                     const u = decodeUTF();
547                                     if (u == PS || u == LS)
548                                         endOfLine();
549                                 }
550                                 p++;
551                                 continue;
552                             }
553                             break;
554                         }
555                         p++;
556                         if (p[-2] == '*' && p - 3 != t.ptr)
557                             break;
558                     }
559                     if (commentToken)
560                     {
561                         t.loc = startLoc;
562                         t.value = TOKcomment;
563                         return;
564                     }
565                     else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
566                     {
567                         // if /** but not /**/
568                         getDocComment(t, lastLine == startLoc.linnum);
569                     }
570                     continue;
571                 case '/':
572                     // do // style comments
573                     startLoc = loc();
574                     while (1)
575                     {
576                         const c = *++p;
577                         switch (c)
578                         {
579                         case '\n':
580                             break;
581                         case '\r':
582                             if (p[1] == '\n')
583                                 p++;
584                             break;
585                         case 0:
586                         case 0x1A:
587                             if (commentToken)
588                             {
589                                 p = end;
590                                 t.loc = startLoc;
591                                 t.value = TOKcomment;
592                                 return;
593                             }
594                             if (doDocComment && t.ptr[2] == '/')
595                                 getDocComment(t, lastLine == startLoc.linnum);
596                             p = end;
597                             t.loc = loc();
598                             t.value = TOKeof;
599                             return;
600                         default:
601                             if (c & 0x80)
602                             {
603                                 const u = decodeUTF();
604                                 if (u == PS || u == LS)
605                                     break;
606                             }
607                             continue;
608                         }
609                         break;
610                     }
611                     if (commentToken)
612                     {
613                         p++;
614                         endOfLine();
615                         t.loc = startLoc;
616                         t.value = TOKcomment;
617                         return;
618                     }
619                     if (doDocComment && t.ptr[2] == '/')
620                         getDocComment(t, lastLine == startLoc.linnum);
621                     p++;
622                     endOfLine();
623                     continue;
624                 case '+':
625                     {
626                         int nest;
627                         startLoc = loc();
628                         p++;
629                         nest = 1;
630                         while (1)
631                         {
632                             char c = *p;
633                             switch (c)
634                             {
635                             case '/':
636                                 p++;
637                                 if (*p == '+')
638                                 {
639                                     p++;
640                                     nest++;
641                                 }
642                                 continue;
643                             case '+':
644                                 p++;
645                                 if (*p == '/')
646                                 {
647                                     p++;
648                                     if (--nest == 0)
649                                         break;
650                                 }
651                                 continue;
652                             case '\r':
653                                 p++;
654                                 if (*p != '\n')
655                                     endOfLine();
656                                 continue;
657                             case '\n':
658                                 endOfLine();
659                                 p++;
660                                 continue;
661                             case 0:
662                             case 0x1A:
663                                 error("unterminated /+ +/ comment");
664                                 p = end;
665                                 t.loc = loc();
666                                 t.value = TOKeof;
667                                 return;
668                             default:
669                                 if (c & 0x80)
670                                 {
671                                     uint u = decodeUTF();
672                                     if (u == PS || u == LS)
673                                         endOfLine();
674                                 }
675                                 p++;
676                                 continue;
677                             }
678                             break;
679                         }
680                         if (commentToken)
681                         {
682                             t.loc = startLoc;
683                             t.value = TOKcomment;
684                             return;
685                         }
686                         if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
687                         {
688                             // if /++ but not /++/
689                             getDocComment(t, lastLine == startLoc.linnum);
690                         }
691                         continue;
692                     }
693                 default:
694                     break;
695                 }
696                 t.value = TOKdiv;
697                 return;
698             case '.':
699                 p++;
700                 if (isdigit(*p))
701                 {
702                     /* Note that we don't allow ._1 and ._ as being
703                      * valid floating point numbers.
704                      */
705                     p--;
706                     t.value = inreal(t);
707                 }
708                 else if (p[0] == '.')
709                 {
710                     if (p[1] == '.')
711                     {
712                         p += 2;
713                         t.value = TOKdotdotdot;
714                     }
715                     else
716                     {
717                         p++;
718                         t.value = TOKslice;
719                     }
720                 }
721                 else
722                     t.value = TOKdot;
723                 return;
724             case '&':
725                 p++;
726                 if (*p == '=')
727                 {
728                     p++;
729                     t.value = TOKandass;
730                 }
731                 else if (*p == '&')
732                 {
733                     p++;
734                     t.value = TOKandand;
735                 }
736                 else
737                     t.value = TOKand;
738                 return;
739             case '|':
740                 p++;
741                 if (*p == '=')
742                 {
743                     p++;
744                     t.value = TOKorass;
745                 }
746                 else if (*p == '|')
747                 {
748                     p++;
749                     t.value = TOKoror;
750                 }
751                 else
752                     t.value = TOKor;
753                 return;
754             case '-':
755                 p++;
756                 if (*p == '=')
757                 {
758                     p++;
759                     t.value = TOKminass;
760                 }
761                 else if (*p == '-')
762                 {
763                     p++;
764                     t.value = TOKminusminus;
765                 }
766                 else
767                     t.value = TOKmin;
768                 return;
769             case '+':
770                 p++;
771                 if (*p == '=')
772                 {
773                     p++;
774                     t.value = TOKaddass;
775                 }
776                 else if (*p == '+')
777                 {
778                     p++;
779                     t.value = TOKplusplus;
780                 }
781                 else
782                     t.value = TOKadd;
783                 return;
784             case '<':
785                 p++;
786                 if (*p == '=')
787                 {
788                     p++;
789                     t.value = TOKle; // <=
790                 }
791                 else if (*p == '<')
792                 {
793                     p++;
794                     if (*p == '=')
795                     {
796                         p++;
797                         t.value = TOKshlass; // <<=
798                     }
799                     else
800                         t.value = TOKshl; // <<
801                 }
802                 else if (*p == '>')
803                 {
804                     p++;
805                     if (*p == '=')
806                     {
807                         p++;
808                         t.value = TOKleg; // <>=
809                     }
810                     else
811                         t.value = TOKlg; // <>
812                 }
813                 else
814                     t.value = TOKlt; // <
815                 return;
816             case '>':
817                 p++;
818                 if (*p == '=')
819                 {
820                     p++;
821                     t.value = TOKge; // >=
822                 }
823                 else if (*p == '>')
824                 {
825                     p++;
826                     if (*p == '=')
827                     {
828                         p++;
829                         t.value = TOKshrass; // >>=
830                     }
831                     else if (*p == '>')
832                     {
833                         p++;
834                         if (*p == '=')
835                         {
836                             p++;
837                             t.value = TOKushrass; // >>>=
838                         }
839                         else
840                             t.value = TOKushr; // >>>
841                     }
842                     else
843                         t.value = TOKshr; // >>
844                 }
845                 else
846                     t.value = TOKgt; // >
847                 return;
848             case '!':
849                 p++;
850                 if (*p == '=')
851                 {
852                     p++;
853                     t.value = TOKnotequal; // !=
854                 }
855                 else if (*p == '<')
856                 {
857                     p++;
858                     if (*p == '>')
859                     {
860                         p++;
861                         if (*p == '=')
862                         {
863                             p++;
864                             t.value = TOKunord; // !<>=
865                         }
866                         else
867                             t.value = TOKue; // !<>
868                     }
869                     else if (*p == '=')
870                     {
871                         p++;
872                         t.value = TOKug; // !<=
873                     }
874                     else
875                         t.value = TOKuge; // !<
876                 }
877                 else if (*p == '>')
878                 {
879                     p++;
880                     if (*p == '=')
881                     {
882                         p++;
883                         t.value = TOKul; // !>=
884                     }
885                     else
886                         t.value = TOKule; // !>
887                 }
888                 else
889                     t.value = TOKnot; // !
890                 return;
891             case '=':
892                 p++;
893                 if (*p == '=')
894                 {
895                     p++;
896                     t.value = TOKequal; // ==
897                 }
898                 else if (*p == '>')
899                 {
900                     p++;
901                     t.value = TOKgoesto; // =>
902                 }
903                 else
904                     t.value = TOKassign; // =
905                 return;
906             case '~':
907                 p++;
908                 if (*p == '=')
909                 {
910                     p++;
911                     t.value = TOKcatass; // ~=
912                 }
913                 else
914                     t.value = TOKtilde; // ~
915                 return;
916             case '^':
917                 p++;
918                 if (*p == '^')
919                 {
920                     p++;
921                     if (*p == '=')
922                     {
923                         p++;
924                         t.value = TOKpowass; // ^^=
925                     }
926                     else
927                         t.value = TOKpow; // ^^
928                 }
929                 else if (*p == '=')
930                 {
931                     p++;
932                     t.value = TOKxorass; // ^=
933                 }
934                 else
935                     t.value = TOKxor; // ^
936                 return;
937             case '(':
938                 p++;
939                 t.value = TOKlparen;
940                 return;
941             case ')':
942                 p++;
943                 t.value = TOKrparen;
944                 return;
945             case '[':
946                 p++;
947                 t.value = TOKlbracket;
948                 return;
949             case ']':
950                 p++;
951                 t.value = TOKrbracket;
952                 return;
953             case '{':
954                 p++;
955                 t.value = TOKlcurly;
956                 return;
957             case '}':
958                 p++;
959                 t.value = TOKrcurly;
960                 return;
961             case '?':
962                 p++;
963                 t.value = TOKquestion;
964                 return;
965             case ',':
966                 p++;
967                 t.value = TOKcomma;
968                 return;
969             case ';':
970                 p++;
971                 t.value = TOKsemicolon;
972                 return;
973             case ':':
974                 p++;
975                 t.value = TOKcolon;
976                 return;
977             case '$':
978                 p++;
979                 t.value = TOKdollar;
980                 return;
981             case '@':
982                 p++;
983                 t.value = TOKat;
984                 return;
985             case '*':
986                 p++;
987                 if (*p == '=')
988                 {
989                     p++;
990                     t.value = TOKmulass;
991                 }
992                 else
993                     t.value = TOKmul;
994                 return;
995             case '%':
996                 p++;
997                 if (*p == '=')
998                 {
999                     p++;
1000                     t.value = TOKmodass;
1001                 }
1002                 else
1003                     t.value = TOKmod;
1004                 return;
1005             case '#':
1006                 {
1007                     p++;
1008                     Token n;
1009                     scan(&n);
1010                     if (n.value == TOKidentifier && n.ident == Id.line)
1011                     {
1012                         poundLine();
1013                         continue;
1014                     }
1015                     else
1016                     {
1017                         t.value = TOKpound;
1018                         return;
1019                     }
1020                 }
1021             default:
1022                 {
1023                     dchar c = *p;
1024                     if (c & 0x80)
1025                     {
1026                         c = decodeUTF();
1027                         // Check for start of unicode identifier
1028                         if (isUniAlpha(c))
1029                             goto case_ident;
1030                         if (c == PS || c == LS)
1031                         {
1032                             endOfLine();
1033                             p++;
1034                             continue;
1035                         }
1036                     }
1037                     if (c < 0x80 && isprint(c))
1038                         error("character '%c' is not a valid token", c);
1039                     else
1040                         error("character 0x%02x is not a valid token", c);
1041                     p++;
1042                     continue;
1043                 }
1044             }
1045         }
1046     }
1047 
1048     final Token* peek(Token* ct)
1049     {
1050         Token* t;
1051         if (ct.next)
1052             t = ct.next;
1053         else
1054         {
1055             t = Token.alloc();
1056             scan(t);
1057             ct.next = t;
1058         }
1059         return t;
1060     }
1061 
1062     /*********************************
1063      * tk is on the opening (.
1064      * Look ahead and return token that is past the closing ).
1065      */
1066     final Token* peekPastParen(Token* tk)
1067     {
1068         //printf("peekPastParen()\n");
1069         int parens = 1;
1070         int curlynest = 0;
1071         while (1)
1072         {
1073             tk = peek(tk);
1074             //tk->print();
1075             switch (tk.value)
1076             {
1077             case TOKlparen:
1078                 parens++;
1079                 continue;
1080             case TOKrparen:
1081                 --parens;
1082                 if (parens)
1083                     continue;
1084                 tk = peek(tk);
1085                 break;
1086             case TOKlcurly:
1087                 curlynest++;
1088                 continue;
1089             case TOKrcurly:
1090                 if (--curlynest >= 0)
1091                     continue;
1092                 break;
1093             case TOKsemicolon:
1094                 if (curlynest)
1095                     continue;
1096                 break;
1097             case TOKeof:
1098                 break;
1099             default:
1100                 continue;
1101             }
1102             return tk;
1103         }
1104     }
1105 
1106     /*******************************************
1107      * Parse escape sequence.
1108      */
1109     final uint escapeSequence()
1110     {
1111         uint c = *p;
1112         int ndigits;
1113         switch (c)
1114         {
1115         case '\'':
1116         case '"':
1117         case '?':
1118         case '\\':
1119         Lconsume:
1120             p++;
1121             break;
1122         case 'a':
1123             c = 7;
1124             goto Lconsume;
1125         case 'b':
1126             c = 8;
1127             goto Lconsume;
1128         case 'f':
1129             c = 12;
1130             goto Lconsume;
1131         case 'n':
1132             c = 10;
1133             goto Lconsume;
1134         case 'r':
1135             c = 13;
1136             goto Lconsume;
1137         case 't':
1138             c = 9;
1139             goto Lconsume;
1140         case 'v':
1141             c = 11;
1142             goto Lconsume;
1143         case 'u':
1144             ndigits = 4;
1145             goto Lhex;
1146         case 'U':
1147             ndigits = 8;
1148             goto Lhex;
1149         case 'x':
1150             ndigits = 2;
1151         Lhex:
1152             p++;
1153             c = *p;
1154             if (ishex(cast(char)c))
1155             {
1156                 uint v = 0;
1157                 int n = 0;
1158                 while (1)
1159                 {
1160                     if (isdigit(cast(char)c))
1161                         c -= '0';
1162                     else if (islower(c))
1163                         c -= 'a' - 10;
1164                     else
1165                         c -= 'A' - 10;
1166                     v = v * 16 + c;
1167                     c = *++p;
1168                     if (++n == ndigits)
1169                         break;
1170                     if (!ishex(cast(char)c))
1171                     {
1172                         error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1173                         break;
1174                     }
1175                 }
1176                 if (ndigits != 2 && !utf_isValidDchar(v))
1177                 {
1178                     error("invalid UTF character \\U%08x", v);
1179                     v = '?'; // recover with valid UTF character
1180                 }
1181                 c = v;
1182             }
1183             else
1184                 error("undefined escape hex sequence \\%c", c);
1185             break;
1186         case '&':
1187             // named character entity
1188             for (const idstart = ++p; 1; p++)
1189             {
1190                 switch (*p)
1191                 {
1192                 case ';':
1193                     c = HtmlNamedEntity(idstart, p - idstart);
1194                     if (c == ~0)
1195                     {
1196                         error("unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
1197                         c = ' ';
1198                     }
1199                     p++;
1200                     break;
1201                 default:
1202                     if (isalpha(*p) || (p != idstart && isdigit(*p)))
1203                         continue;
1204                     error("unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
1205                     break;
1206                 }
1207                 break;
1208             }
1209             break;
1210         case 0:
1211         case 0x1A:
1212             // end of file
1213             c = '\\';
1214             break;
1215         default:
1216             if (isoctal(cast(char)c))
1217             {
1218                 uint v = 0;
1219                 int n = 0;
1220                 do
1221                 {
1222                     v = v * 8 + (c - '0');
1223                     c = *++p;
1224                 }
1225                 while (++n < 3 && isoctal(cast(char)c));
1226                 c = v;
1227                 if (c > 0xFF)
1228                     error("escape octal sequence \\%03o is larger than \\377", c);
1229             }
1230             else
1231                 error("undefined escape sequence \\%c", c);
1232             break;
1233         }
1234         return c;
1235     }
1236 
1237     /**************************************
1238      */
1239     final TOK wysiwygStringConstant(Token* t, int tc)
1240     {
1241         Loc start = loc();
1242         p++;
1243         stringbuffer.reset();
1244         while (1)
1245         {
1246             dchar c = *p++;
1247             switch (c)
1248             {
1249             case '\n':
1250                 endOfLine();
1251                 break;
1252             case '\r':
1253                 if (*p == '\n')
1254                     continue;
1255                 // ignore
1256                 c = '\n'; // treat EndOfLine as \n character
1257                 endOfLine();
1258                 break;
1259             case 0:
1260             case 0x1A:
1261                 error("unterminated string constant starting at %s", start.toChars());
1262                 t.setString();
1263                 return TOKstring;
1264             case '"':
1265             case '`':
1266                 if (c == tc)
1267                 {
1268                     t.setString(stringbuffer);
1269                     stringPostfix(t);
1270                     return TOKstring;
1271                 }
1272                 break;
1273             default:
1274                 if (c & 0x80)
1275                 {
1276                     p--;
1277                     const u = decodeUTF();
1278                     p++;
1279                     if (u == PS || u == LS)
1280                         endOfLine();
1281                     stringbuffer.writeUTF8(u);
1282                     continue;
1283                 }
1284                 break;
1285             }
1286             stringbuffer.writeByte(c);
1287         }
1288     }
1289 
1290     /**************************************
1291      * Lex hex strings:
1292      *      x"0A ae 34FE BD"
1293      */
1294     final TOK hexStringConstant(Token* t)
1295     {
1296         Loc start = loc();
1297         uint n = 0;
1298         uint v = ~0; // dead assignment, needed to suppress warning
1299         p++;
1300         stringbuffer.reset();
1301         while (1)
1302         {
1303             dchar c = *p++;
1304             switch (c)
1305             {
1306             case ' ':
1307             case '\t':
1308             case '\v':
1309             case '\f':
1310                 continue; // skip white space
1311             case '\r':
1312                 if (*p == '\n')
1313                     continue; // ignore '\r' if followed by '\n'
1314                 // Treat isolated '\r' as if it were a '\n'
1315                 goto case '\n';
1316             case '\n':
1317                 endOfLine();
1318                 continue;
1319             case 0:
1320             case 0x1A:
1321                 error("unterminated string constant starting at %s", start.toChars());
1322                 t.setString();
1323                 return TOKxstring;
1324             case '"':
1325                 if (n & 1)
1326                 {
1327                     error("odd number (%d) of hex characters in hex string", n);
1328                     stringbuffer.writeByte(v);
1329                 }
1330                 t.setString(stringbuffer);
1331                 stringPostfix(t);
1332                 return TOKxstring;
1333             default:
1334                 if (c >= '0' && c <= '9')
1335                     c -= '0';
1336                 else if (c >= 'a' && c <= 'f')
1337                     c -= 'a' - 10;
1338                 else if (c >= 'A' && c <= 'F')
1339                     c -= 'A' - 10;
1340                 else if (c & 0x80)
1341                 {
1342                     p--;
1343                     const u = decodeUTF();
1344                     p++;
1345                     if (u == PS || u == LS)
1346                         endOfLine();
1347                     else
1348                         error("non-hex character \\u%04x in hex string", u);
1349                 }
1350                 else
1351                     error("non-hex character '%c' in hex string", c);
1352                 if (n & 1)
1353                 {
1354                     v = (v << 4) | c;
1355                     stringbuffer.writeByte(v);
1356                 }
1357                 else
1358                     v = c;
1359                 n++;
1360                 break;
1361             }
1362         }
1363         assert(0); // see bug 15731
1364     }
1365 
1366     /**************************************
1367      * Lex delimited strings:
1368      *      q"(foo(xxx))"   // "foo(xxx)"
1369      *      q"[foo(]"       // "foo("
1370      *      q"/foo]/"       // "foo]"
1371      *      q"HERE
1372      *      foo
1373      *      HERE"           // "foo\n"
1374      * Input:
1375      *      p is on the "
1376      */
1377     final TOK delimitedStringConstant(Token* t)
1378     {
1379         Loc start = loc();
1380         dchar delimleft = 0;
1381         dchar delimright = 0;
1382         uint nest = 1;
1383         uint nestcount = ~0; // dead assignment, needed to suppress warning
1384         Identifier hereid = null;
1385         uint blankrol = 0;
1386         uint startline = 0;
1387         p++;
1388         stringbuffer.reset();
1389         while (1)
1390         {
1391             dchar c = *p++;
1392             //printf("c = '%c'\n", c);
1393             switch (c)
1394             {
1395             case '\n':
1396             Lnextline:
1397                 endOfLine();
1398                 startline = 1;
1399                 if (blankrol)
1400                 {
1401                     blankrol = 0;
1402                     continue;
1403                 }
1404                 if (hereid)
1405                 {
1406                     stringbuffer.writeUTF8(c);
1407                     continue;
1408                 }
1409                 break;
1410             case '\r':
1411                 if (*p == '\n')
1412                     continue;
1413                 // ignore
1414                 c = '\n'; // treat EndOfLine as \n character
1415                 goto Lnextline;
1416             case 0:
1417             case 0x1A:
1418                 error("unterminated delimited string constant starting at %s", start.toChars());
1419                 t.setString();
1420                 return TOKstring;
1421             default:
1422                 if (c & 0x80)
1423                 {
1424                     p--;
1425                     c = decodeUTF();
1426                     p++;
1427                     if (c == PS || c == LS)
1428                         goto Lnextline;
1429                 }
1430                 break;
1431             }
1432             if (delimleft == 0)
1433             {
1434                 delimleft = c;
1435                 nest = 1;
1436                 nestcount = 1;
1437                 if (c == '(')
1438                     delimright = ')';
1439                 else if (c == '{')
1440                     delimright = '}';
1441                 else if (c == '[')
1442                     delimright = ']';
1443                 else if (c == '<')
1444                     delimright = '>';
1445                 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1446                 {
1447                     // Start of identifier; must be a heredoc
1448                     Token tok;
1449                     p--;
1450                     scan(&tok); // read in heredoc identifier
1451                     if (tok.value != TOKidentifier)
1452                     {
1453                         error("identifier expected for heredoc, not %s", tok.toChars());
1454                         delimright = c;
1455                     }
1456                     else
1457                     {
1458                         hereid = tok.ident;
1459                         //printf("hereid = '%s'\n", hereid->toChars());
1460                         blankrol = 1;
1461                     }
1462                     nest = 0;
1463                 }
1464                 else
1465                 {
1466                     delimright = c;
1467                     nest = 0;
1468                     if (isspace(c))
1469                         error("delimiter cannot be whitespace");
1470                 }
1471             }
1472             else
1473             {
1474                 if (blankrol)
1475                 {
1476                     error("heredoc rest of line should be blank");
1477                     blankrol = 0;
1478                     continue;
1479                 }
1480                 if (nest == 1)
1481                 {
1482                     if (c == delimleft)
1483                         nestcount++;
1484                     else if (c == delimright)
1485                     {
1486                         nestcount--;
1487                         if (nestcount == 0)
1488                             goto Ldone;
1489                     }
1490                 }
1491                 else if (c == delimright)
1492                     goto Ldone;
1493                 if (startline && isalpha(c) && hereid)
1494                 {
1495                     Token tok;
1496                     auto psave = p;
1497                     p--;
1498                     scan(&tok); // read in possible heredoc identifier
1499                     //printf("endid = '%s'\n", tok.ident->toChars());
1500                     if (tok.value == TOKidentifier && tok.ident.equals(hereid))
1501                     {
1502                         /* should check that rest of line is blank
1503                          */
1504                         goto Ldone;
1505                     }
1506                     p = psave;
1507                 }
1508                 stringbuffer.writeUTF8(c);
1509                 startline = 0;
1510             }
1511         }
1512     Ldone:
1513         if (*p == '"')
1514             p++;
1515         else if (hereid)
1516             error("delimited string must end in %s\"", hereid.toChars());
1517         else
1518             error("delimited string must end in %c\"", delimright);
1519         t.setString(stringbuffer);
1520         stringPostfix(t);
1521         return TOKstring;
1522     }
1523 
1524     /**************************************
1525      * Lex delimited strings:
1526      *      q{ foo(xxx) } // " foo(xxx) "
1527      *      q{foo(}       // "foo("
1528      *      q{{foo}"}"}   // "{foo}"}""
1529      * Input:
1530      *      p is on the q
1531      */
1532     final TOK tokenStringConstant(Token* t)
1533     {
1534         uint nest = 1;
1535         const start = loc();
1536         const pstart = ++p;
1537         while (1)
1538         {
1539             Token tok;
1540             scan(&tok);
1541             switch (tok.value)
1542             {
1543             case TOKlcurly:
1544                 nest++;
1545                 continue;
1546             case TOKrcurly:
1547                 if (--nest == 0)
1548                 {
1549                     t.setString(pstart, p - 1 - pstart);
1550                     stringPostfix(t);
1551                     return TOKstring;
1552                 }
1553                 continue;
1554             case TOKeof:
1555                 error("unterminated token string constant starting at %s", start.toChars());
1556                 t.setString();
1557                 return TOKstring;
1558             default:
1559                 continue;
1560             }
1561         }
1562     }
1563 
1564     /**************************************
1565      */
1566     final TOK escapeStringConstant(Token* t, int wide)
1567     {
1568         const start = loc();
1569         p++;
1570         stringbuffer.reset();
1571         while (1)
1572         {
1573             dchar c = *p++;
1574             switch (c)
1575             {
1576             case '\\':
1577                 switch (*p)
1578                 {
1579                 case 'u':
1580                 case 'U':
1581                 case '&':
1582                     c = escapeSequence();
1583                     stringbuffer.writeUTF8(c);
1584                     continue;
1585                 default:
1586                     c = escapeSequence();
1587                     break;
1588                 }
1589                 break;
1590             case '\n':
1591                 endOfLine();
1592                 break;
1593             case '\r':
1594                 if (*p == '\n')
1595                     continue;
1596                 // ignore
1597                 c = '\n'; // treat EndOfLine as \n character
1598                 endOfLine();
1599                 break;
1600             case '"':
1601                 t.setString(stringbuffer);
1602                 stringPostfix(t);
1603                 return TOKstring;
1604             case 0:
1605             case 0x1A:
1606                 p--;
1607                 error("unterminated string constant starting at %s", start.toChars());
1608                 t.setString();
1609                 return TOKstring;
1610             default:
1611                 if (c & 0x80)
1612                 {
1613                     p--;
1614                     c = decodeUTF();
1615                     if (c == LS || c == PS)
1616                     {
1617                         c = '\n';
1618                         endOfLine();
1619                     }
1620                     p++;
1621                     stringbuffer.writeUTF8(c);
1622                     continue;
1623                 }
1624                 break;
1625             }
1626             stringbuffer.writeByte(c);
1627         }
1628     }
1629 
1630     /**************************************
1631      */
1632     final TOK charConstant(Token* t)
1633     {
1634         auto tk = TOKcharv;
1635         //printf("Lexer::charConstant\n");
1636         p++;
1637         dchar c = *p++;
1638         switch (c)
1639         {
1640         case '\\':
1641             switch (*p)
1642             {
1643             case 'u':
1644                 t.uns64value = escapeSequence();
1645                 tk = TOKwcharv;
1646                 break;
1647             case 'U':
1648             case '&':
1649                 t.uns64value = escapeSequence();
1650                 tk = TOKdcharv;
1651                 break;
1652             default:
1653                 t.uns64value = escapeSequence();
1654                 break;
1655             }
1656             break;
1657         case '\n':
1658         L1:
1659             endOfLine();
1660             goto case;
1661         case '\r':
1662         case 0:
1663         case 0x1A:
1664         case '\'':
1665             error("unterminated character constant");
1666             t.uns64value = '?';
1667             return tk;
1668         default:
1669             if (c & 0x80)
1670             {
1671                 p--;
1672                 c = decodeUTF();
1673                 p++;
1674                 if (c == LS || c == PS)
1675                     goto L1;
1676                 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1677                     tk = TOKwcharv;
1678                 else
1679                     tk = TOKdcharv;
1680             }
1681             t.uns64value = c;
1682             break;
1683         }
1684         if (*p != '\'')
1685         {
1686             error("unterminated character constant");
1687             t.uns64value = '?';
1688             return tk;
1689         }
1690         p++;
1691         return tk;
1692     }
1693 
1694     /***************************************
1695      * Get postfix of string literal.
1696      */
1697     final void stringPostfix(Token* t)
1698     {
1699         switch (*p)
1700         {
1701         case 'c':
1702         case 'w':
1703         case 'd':
1704             t.postfix = *p;
1705             p++;
1706             break;
1707         default:
1708             t.postfix = 0;
1709             break;
1710         }
1711     }
1712 
1713     /**************************************
1714      * Read in a number.
1715      * If it's an integer, store it in tok.TKutok.Vlong.
1716      *      integers can be decimal, octal or hex
1717      *      Handle the suffixes U, UL, LU, L, etc.
1718      * If it's double, store it in tok.TKutok.Vdouble.
1719      * Returns:
1720      *      TKnum
1721      *      TKdouble,...
1722      */
1723     final TOK number(Token* t)
1724     {
1725         int base = 10;
1726         const start = p;
1727         uinteger_t n = 0; // unsigned >=64 bit integer type
1728         int d;
1729         bool err = false;
1730         bool overflow = false;
1731         dchar c = *p;
1732         if (c == '0')
1733         {
1734             ++p;
1735             c = *p;
1736             switch (c)
1737             {
1738             case '0':
1739             case '1':
1740             case '2':
1741             case '3':
1742             case '4':
1743             case '5':
1744             case '6':
1745             case '7':
1746                 n = c - '0';
1747                 ++p;
1748                 base = 8;
1749                 break;
1750             case 'x':
1751             case 'X':
1752                 ++p;
1753                 base = 16;
1754                 break;
1755             case 'b':
1756             case 'B':
1757                 ++p;
1758                 base = 2;
1759                 break;
1760             case '.':
1761                 if (p[1] == '.')
1762                     goto Ldone;
1763                 // if ".."
1764                 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
1765                     goto Ldone;
1766                 // if ".identifier" or ".unicode"
1767                 goto Lreal;
1768                 // '.' is part of current token
1769             case 'i':
1770             case 'f':
1771             case 'F':
1772                 goto Lreal;
1773             case '_':
1774                 ++p;
1775                 base = 8;
1776                 break;
1777             case 'L':
1778                 if (p[1] == 'i')
1779                     goto Lreal;
1780                 break;
1781             default:
1782                 break;
1783             }
1784         }
1785         while (1)
1786         {
1787             c = *p;
1788             switch (c)
1789             {
1790             case '0':
1791             case '1':
1792                 ++p;
1793                 d = c - '0';
1794                 break;
1795             case '2':
1796             case '3':
1797             case '4':
1798             case '5':
1799             case '6':
1800             case '7':
1801                 if (base == 2 && !err)
1802                 {
1803                     error("binary digit expected");
1804                     err = true;
1805                 }
1806                 ++p;
1807                 d = c - '0';
1808                 break;
1809             case '8':
1810             case '9':
1811                 ++p;
1812                 if (base < 10 && !err)
1813                 {
1814                     error("radix %d digit expected, not '%c'", base, c);
1815                     err = true;
1816                 }
1817                 d = c - '0';
1818                 break;
1819             case 'a':
1820             case 'b':
1821             case 'c':
1822             case 'd':
1823             case 'e':
1824             case 'f':
1825             case 'A':
1826             case 'B':
1827             case 'C':
1828             case 'D':
1829             case 'E':
1830             case 'F':
1831                 ++p;
1832                 if (base != 16)
1833                 {
1834                     if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
1835                         goto Lreal;
1836                     if (!err)
1837                     {
1838                         error("radix %d digit expected, not '%c'", base, c);
1839                         err = true;
1840                     }
1841                 }
1842                 if (c >= 'a')
1843                     d = c + 10 - 'a';
1844                 else
1845                     d = c + 10 - 'A';
1846                 break;
1847             case 'L':
1848                 if (p[1] == 'i')
1849                     goto Lreal;
1850                 goto Ldone;
1851             case '.':
1852                 if (p[1] == '.')
1853                     goto Ldone;
1854                 // if ".."
1855                 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
1856                     goto Ldone;
1857                 // if ".identifier" or ".unicode"
1858                 goto Lreal;
1859                 // otherwise as part of a floating point literal
1860             case 'p':
1861             case 'P':
1862             case 'i':
1863             Lreal:
1864                 p = start;
1865                 return inreal(t);
1866             case '_':
1867                 ++p;
1868                 continue;
1869             default:
1870                 goto Ldone;
1871             }
1872             // Avoid expensive overflow check if we aren't at risk of overflow
1873             if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
1874                 n = n * base + d;
1875             else
1876             {
1877                 import core.checkedint : mulu, addu;
1878 
1879                 n = mulu(n, base, overflow);
1880                 n = addu(n, d, overflow);
1881             }
1882         }
1883     Ldone:
1884         if (overflow && !err)
1885         {
1886             error("integer overflow");
1887             err = true;
1888         }
1889         enum FLAGS : int
1890         {
1891             FLAGS_none = 0,
1892             FLAGS_decimal = 1, // decimal
1893             FLAGS_unsigned = 2, // u or U suffix
1894             FLAGS_long = 4, // L suffix
1895         }
1896 
1897         alias FLAGS_none = FLAGS.FLAGS_none;
1898         alias FLAGS_decimal = FLAGS.FLAGS_decimal;
1899         alias FLAGS_unsigned = FLAGS.FLAGS_unsigned;
1900         alias FLAGS_long = FLAGS.FLAGS_long;
1901 
1902         FLAGS flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
1903         // Parse trailing 'u', 'U', 'l' or 'L' in any combination
1904         const psuffix = p;
1905         while (1)
1906         {
1907             FLAGS f;
1908             switch (*p)
1909             {
1910             case 'U':
1911             case 'u':
1912                 f = FLAGS_unsigned;
1913                 goto L1;
1914             case 'l':
1915                 f = FLAGS_long;
1916                 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
1917                 goto L1;
1918             case 'L':
1919                 f = FLAGS_long;
1920             L1:
1921                 p++;
1922                 if ((flags & f) && !err)
1923                 {
1924                     error("unrecognized token");
1925                     err = true;
1926                 }
1927                 flags = cast(FLAGS)(flags | f);
1928                 continue;
1929             default:
1930                 break;
1931             }
1932             break;
1933         }
1934         if (base == 8 && n >= 8)
1935             error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", n, p - psuffix, psuffix, n, p - psuffix, psuffix);
1936         TOK result;
1937         switch (flags)
1938         {
1939         case FLAGS_none:
1940             /* Octal or Hexadecimal constant.
1941              * First that fits: int, uint, long, ulong
1942              */
1943             if (n & 0x8000000000000000L)
1944                 result = TOKuns64v;
1945             else if (n & 0xFFFFFFFF00000000L)
1946                 result = TOKint64v;
1947             else if (n & 0x80000000)
1948                 result = TOKuns32v;
1949             else
1950                 result = TOKint32v;
1951             break;
1952         case FLAGS_decimal:
1953             /* First that fits: int, long, long long
1954              */
1955             if (n & 0x8000000000000000L)
1956             {
1957                 if (!err)
1958                 {
1959                     error("signed integer overflow");
1960                     err = true;
1961                 }
1962                 result = TOKuns64v;
1963             }
1964             else if (n & 0xFFFFFFFF80000000L)
1965                 result = TOKint64v;
1966             else
1967                 result = TOKint32v;
1968             break;
1969         case FLAGS_unsigned:
1970         case FLAGS_decimal | FLAGS_unsigned:
1971             /* First that fits: uint, ulong
1972              */
1973             if (n & 0xFFFFFFFF00000000L)
1974                 result = TOKuns64v;
1975             else
1976                 result = TOKuns32v;
1977             break;
1978         case FLAGS_decimal | FLAGS_long:
1979             if (n & 0x8000000000000000L)
1980             {
1981                 if (!err)
1982                 {
1983                     error("signed integer overflow");
1984                     err = true;
1985                 }
1986                 result = TOKuns64v;
1987             }
1988             else
1989                 result = TOKint64v;
1990             break;
1991         case FLAGS_long:
1992             if (n & 0x8000000000000000L)
1993                 result = TOKuns64v;
1994             else
1995                 result = TOKint64v;
1996             break;
1997         case FLAGS_unsigned | FLAGS_long:
1998         case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
1999             result = TOKuns64v;
2000             break;
2001         default:
2002             debug
2003             {
2004                 printf("%x\n", flags);
2005             }
2006             assert(0);
2007         }
2008         t.uns64value = n;
2009         return result;
2010     }
2011 
2012     /**************************************
2013      * Read in characters, converting them to real.
2014      * Bugs:
2015      *      Exponent overflow not detected.
2016      *      Too much requested precision is not detected.
2017      */
2018     final TOK inreal(Token* t)
2019     {
2020         //printf("Lexer::inreal()\n");
2021         debug
2022         {
2023             assert(*p == '.' || isdigit(*p));
2024         }
2025         stringbuffer.reset();
2026         auto pstart = p;
2027         bool hex = false;
2028         dchar c = *p++;
2029         // Leading '0x'
2030         if (c == '0')
2031         {
2032             c = *p++;
2033             if (c == 'x' || c == 'X')
2034             {
2035                 hex = true;
2036                 c = *p++;
2037             }
2038         }
2039         // Digits to left of '.'
2040         while (1)
2041         {
2042             if (c == '.')
2043             {
2044                 c = *p++;
2045                 break;
2046             }
2047             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2048             {
2049                 c = *p++;
2050                 continue;
2051             }
2052             break;
2053         }
2054         // Digits to right of '.'
2055         while (1)
2056         {
2057             if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
2058             {
2059                 c = *p++;
2060                 continue;
2061             }
2062             break;
2063         }
2064         if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
2065         {
2066             c = *p++;
2067             if (c == '-' || c == '+')
2068             {
2069                 c = *p++;
2070             }
2071             bool anyexp = false;
2072             while (1)
2073             {
2074                 if (isdigit(c))
2075                 {
2076                     anyexp = true;
2077                     c = *p++;
2078                     continue;
2079                 }
2080                 if (c == '_')
2081                 {
2082                     c = *p++;
2083                     continue;
2084                 }
2085                 if (!anyexp)
2086                     error("missing exponent");
2087                 break;
2088             }
2089         }
2090         else if (hex)
2091             error("exponent required for hex float");
2092         --p;
2093         while (pstart < p)
2094         {
2095             if (*pstart != '_')
2096                 stringbuffer.writeByte(*pstart);
2097             ++pstart;
2098         }
2099         stringbuffer.writeByte(0);
2100         auto sbufptr = cast(const(char)*)stringbuffer.data;
2101         TOK result;
2102         t.float80value = Port.strtold(sbufptr, null);
2103         errno = 0;
2104         switch (*p)
2105         {
2106         case 'F':
2107         case 'f':
2108             // Only interested in errno return
2109             cast(void)Port.strtof(sbufptr, null);
2110             result = TOKfloat32v;
2111             p++;
2112             break;
2113         default:
2114             /* Should do our own strtod(), since dmc and linux gcc
2115              * accept 2.22507e-308, while apple gcc will only take
2116              * 2.22508e-308. Not sure who is right.
2117              */
2118             // Only interested in errno return
2119             cast(void)Port.strtod(sbufptr, null);
2120             result = TOKfloat64v;
2121             break;
2122         case 'l':
2123             error("use 'L' suffix instead of 'l'");
2124             goto case 'L';
2125         case 'L':
2126             result = TOKfloat80v;
2127             p++;
2128             break;
2129         }
2130         if (*p == 'i' || *p == 'I')
2131         {
2132             if (*p == 'I')
2133                 error("use 'i' suffix instead of 'I'");
2134             p++;
2135             switch (result)
2136             {
2137             case TOKfloat32v:
2138                 result = TOKimaginary32v;
2139                 break;
2140             case TOKfloat64v:
2141                 result = TOKimaginary64v;
2142                 break;
2143             case TOKfloat80v:
2144                 result = TOKimaginary80v;
2145                 break;
2146             default:
2147                 break;
2148             }
2149         }
2150         if (errno == ERANGE)
2151         {
2152             const char* suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
2153             error(scanloc, "number '%s%s' is not representable", sbufptr, suffix);
2154         }
2155         debug
2156         {
2157             switch (result)
2158             {
2159             case TOKfloat32v:
2160             case TOKfloat64v:
2161             case TOKfloat80v:
2162             case TOKimaginary32v:
2163             case TOKimaginary64v:
2164             case TOKimaginary80v:
2165                 break;
2166             default:
2167                 assert(0);
2168             }
2169         }
2170         return result;
2171     }
2172 
2173     final Loc loc()
2174     {
2175         scanloc.charnum = cast(uint)(1 + p - line);
2176         return scanloc;
2177     }
2178 
2179     final void error(const(char)* format, ...)
2180     {
2181         va_list ap;
2182         va_start(ap, format);
2183         .verror(token.loc, format, ap);
2184         va_end(ap);
2185         errors = true;
2186     }
2187 
2188     final void error(Loc loc, const(char)* format, ...)
2189     {
2190         va_list ap;
2191         va_start(ap, format);
2192         .verror(loc, format, ap);
2193         va_end(ap);
2194         errors = true;
2195     }
2196 
2197     final void deprecation(const(char)* format, ...)
2198     {
2199         va_list ap;
2200         va_start(ap, format);
2201         .vdeprecation(token.loc, format, ap);
2202         va_end(ap);
2203         if (global.params.useDeprecated == 0)
2204             errors = true;
2205     }
2206 
2207     /*********************************************
2208      * parse:
2209      *      #line linnum [filespec]
2210      * also allow __LINE__ for linnum, and __FILE__ for filespec
2211      */
2212     final void poundLine()
2213     {
2214         auto linnum = this.scanloc.linnum;
2215         const(char)* filespec = null;
2216         const loc = this.loc();
2217         Token tok;
2218         scan(&tok);
2219         if (tok.value == TOKint32v || tok.value == TOKint64v)
2220         {
2221             const lin = cast(int)(tok.uns64value - 1);
2222             if (lin != tok.uns64value - 1)
2223                 error("line number %lld out of range", cast(ulong)tok.uns64value);
2224             else
2225                 linnum = lin;
2226         }
2227         else if (tok.value == TOKline)
2228         {
2229         }
2230         else
2231             goto Lerr;
2232         while (1)
2233         {
2234             switch (*p)
2235             {
2236             case 0:
2237             case 0x1A:
2238             case '\n':
2239             Lnewline:
2240                 this.scanloc.linnum = linnum;
2241                 if (filespec)
2242                     this.scanloc.filename = filespec;
2243                 return;
2244             case '\r':
2245                 p++;
2246                 if (*p != '\n')
2247                 {
2248                     p--;
2249                     goto Lnewline;
2250                 }
2251                 continue;
2252             case ' ':
2253             case '\t':
2254             case '\v':
2255             case '\f':
2256                 p++;
2257                 continue;
2258                 // skip white space
2259             case '_':
2260                 if (memcmp(p, "__FILE__".ptr, 8) == 0)
2261                 {
2262                     p += 8;
2263                     filespec = mem.xstrdup(scanloc.filename);
2264                     continue;
2265                 }
2266                 goto Lerr;
2267             case '"':
2268                 if (filespec)
2269                     goto Lerr;
2270                 stringbuffer.reset();
2271                 p++;
2272                 while (1)
2273                 {
2274                     uint c;
2275                     c = *p;
2276                     switch (c)
2277                     {
2278                     case '\n':
2279                     case '\r':
2280                     case 0:
2281                     case 0x1A:
2282                         goto Lerr;
2283                     case '"':
2284                         stringbuffer.writeByte(0);
2285                         filespec = mem.xstrdup(cast(const(char)*)stringbuffer.data);
2286                         p++;
2287                         break;
2288                     default:
2289                         if (c & 0x80)
2290                         {
2291                             uint u = decodeUTF();
2292                             if (u == PS || u == LS)
2293                                 goto Lerr;
2294                         }
2295                         stringbuffer.writeByte(c);
2296                         p++;
2297                         continue;
2298                     }
2299                     break;
2300                 }
2301                 continue;
2302             default:
2303                 if (*p & 0x80)
2304                 {
2305                     uint u = decodeUTF();
2306                     if (u == PS || u == LS)
2307                         goto Lnewline;
2308                 }
2309                 goto Lerr;
2310             }
2311         }
2312     Lerr:
2313         error(loc, "#line integer [\"filespec\"]\\n expected");
2314     }
2315 
2316     /********************************************
2317      * Decode UTF character.
2318      * Issue error messages for invalid sequences.
2319      * Return decoded character, advance p to last character in UTF sequence.
2320      */
2321     final uint decodeUTF()
2322     {
2323         const s = p;
2324         assert(*s & 0x80);
2325         // Check length of remaining string up to 6 UTF-8 characters
2326         size_t len;
2327         for (len = 1; len < 6 && s[len]; len++)
2328         {
2329         }
2330         size_t idx = 0;
2331         dchar u;
2332         const msg = utf_decodeChar(s, len, idx, u);
2333         p += idx - 1;
2334         if (msg)
2335         {
2336             error("%s", msg);
2337         }
2338         return u;
2339     }
2340 
2341     /***************************************************
2342      * Parse doc comment embedded between t->ptr and p.
2343      * Remove trailing blanks and tabs from lines.
2344      * Replace all newlines with \n.
2345      * Remove leading comment character from each line.
2346      * Decide if it's a lineComment or a blockComment.
2347      * Append to previous one for this token.
2348      */
2349     final void getDocComment(Token* t, uint lineComment)
2350     {
2351         /* ct tells us which kind of comment it is: '/', '*', or '+'
2352          */
2353         const ct = t.ptr[2];
2354         /* Start of comment text skips over / * *, / + +, or / / /
2355          */
2356         const(char)* q = t.ptr + 3; // start of comment text
2357         const(char)* qend = p;
2358         if (ct == '*' || ct == '+')
2359             qend -= 2;
2360         /* Scan over initial row of ****'s or ++++'s or ////'s
2361          */
2362         for (; q < qend; q++)
2363         {
2364             if (*q != ct)
2365                 break;
2366         }
2367         /* Remove leading spaces until start of the comment
2368          */
2369         int linestart = 0;
2370         if (ct == '/')
2371         {
2372             while (q < qend && (*q == ' ' || *q == '\t'))
2373                 ++q;
2374         }
2375         else if (q < qend)
2376         {
2377             if (*q == '\r')
2378             {
2379                 ++q;
2380                 if (q < qend && *q == '\n')
2381                     ++q;
2382                 linestart = 1;
2383             }
2384             else if (*q == '\n')
2385             {
2386                 ++q;
2387                 linestart = 1;
2388             }
2389         }
2390         /* Remove trailing row of ****'s or ++++'s
2391          */
2392         if (ct != '/')
2393         {
2394             for (; q < qend; qend--)
2395             {
2396                 if (qend[-1] != ct)
2397                     break;
2398             }
2399         }
2400         /* Comment is now [q .. qend].
2401          * Canonicalize it into buf[].
2402          */
2403         OutBuffer buf;
2404         for (; q < qend; q++)
2405         {
2406             char c = *q;
2407             switch (c)
2408             {
2409             case '*':
2410             case '+':
2411                 if (linestart && c == ct)
2412                 {
2413                     linestart = 0;
2414                     /* Trim preceding whitespace up to preceding \n
2415                      */
2416                     while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2417                         buf.offset--;
2418                     continue;
2419                 }
2420                 break;
2421             case ' ':
2422             case '\t':
2423                 break;
2424             case '\r':
2425                 if (q[1] == '\n')
2426                     continue;
2427                 // skip the \r
2428                 goto Lnewline;
2429             default:
2430                 if (c == 226)
2431                 {
2432                     // If LS or PS
2433                     if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
2434                     {
2435                         q += 2;
2436                         goto Lnewline;
2437                     }
2438                 }
2439                 linestart = 0;
2440                 break;
2441             Lnewline:
2442                 c = '\n'; // replace all newlines with \n
2443                 goto case;
2444             case '\n':
2445                 linestart = 1;
2446                 /* Trim trailing whitespace
2447                  */
2448                 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2449                     buf.offset--;
2450                 break;
2451             }
2452             buf.writeByte(c);
2453         }
2454         /* Trim trailing whitespace (if the last line does not have newline)
2455          */
2456         if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2457         {
2458             while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2459                 buf.offset--;
2460         }
2461         // Always end with a newline
2462         if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2463             buf.writeByte('\n');
2464         buf.writeByte(0);
2465         // It's a line comment if the start of the doc comment comes
2466         // after other non-whitespace on the same line.
2467         const(char)** dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
2468         // Combine with previous doc comment, if any
2469         if (*dc)
2470             *dc = combineComments(*dc, cast(const(char)*)buf.data);
2471         else
2472             *dc = cast(const(char)*)buf.extractData();
2473     }
2474 
2475     /********************************************
2476      * Combine two document comments into one,
2477      * separated by a newline.
2478      */
2479     final static const(char)* combineComments(const(char)* c1, const(char)* c2)
2480     {
2481         //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
2482         auto c = c2;
2483         if (c1)
2484         {
2485             c = c1;
2486             if (c2)
2487             {
2488                 size_t len1 = strlen(c1);
2489                 size_t len2 = strlen(c2);
2490                 int insertNewLine = 0;
2491                 if (len1 && c1[len1 - 1] != '\n')
2492                 {
2493                     ++len1;
2494                     insertNewLine = 1;
2495                 }
2496                 auto p = cast(char*)mem.xmalloc(len1 + 1 + len2 + 1);
2497                 memcpy(p, c1, len1 - insertNewLine);
2498                 if (insertNewLine)
2499                     p[len1 - 1] = '\n';
2500                 p[len1] = '\n';
2501                 memcpy(p + len1 + 1, c2, len2);
2502                 p[len1 + 1 + len2] = 0;
2503                 c = p;
2504             }
2505         }
2506         return c;
2507     }
2508 
2509 private:
2510     final void endOfLine()
2511     {
2512         scanloc.linnum++;
2513         line = p;
2514     }
2515 }