// Compiler implementation of the D programming language
// Copyright (c) 1999-2015 by Digital Mars
// All Rights Reserved
// written by Walter Bright
// http://www.digitalmars.com
// Distributed under the Boost Software License, Version 1.0.
// http://www.boost.org/LICENSE_1_0.txt

module ddmd.lexer;

import core.stdc.ctype, core.stdc.errno, core.stdc.stdarg, core.stdc.stdio, core.stdc.string, core.stdc.time;
import ddmd.entity, ddmd.errors, ddmd.globals, ddmd.id, ddmd.identifier, ddmd.root.longdouble, ddmd.root.outbuffer, ddmd.root.port, ddmd.root.rmem, ddmd.root.stringtable, ddmd.tokens, ddmd.utf;

// Unicode line separator (U+2028)
enum LS = 0x2028;
// Unicode paragraph separator (U+2029)
enum PS = 0x2029;

/********************************************
 * Do our own char maps.
 * cmtable[c] holds a bit set of the CM* flags below for every byte value c;
 * it is filled in by cmtable_init().
 */
extern (C++) __gshared ubyte[256] cmtable;
extern (C++) __gshared const(int) CMoctal = 0x1;  // '0'..'7'
extern (C++) __gshared const(int) CMhex = 0x2;    // isxdigit()
extern (C++) __gshared const(int) CMidchar = 0x4; // isalnum() or '_'

/// Returns: true if c is an octal digit according to cmtable.
extern (C++) bool isoctal(char c)
{
    return (cmtable[c] & CMoctal) != 0;
}

/// Returns: true if c is a hexadecimal digit according to cmtable.
extern (C++) bool ishex(char c)
{
    return (cmtable[c] & CMhex) != 0;
}

/// Returns: true if c may appear inside an identifier according to cmtable.
extern (C++) bool isidchar(char c)
{
    return (cmtable[c] & CMidchar) != 0;
}

/********************************************
 * Fill in cmtable for all 256 byte values.
 * Flags are OR-ed in, so calling this more than once is harmless.
 */
extern (C++) static void cmtable_init()
{
    for (uint c = 0; c < 256; c++)
    {
        if ('0' <= c && c <= '7')
            cmtable[c] |= CMoctal;
        if (isxdigit(c))
            cmtable[c] |= CMhex;
        if (isalnum(c) || c == '_')
            cmtable[c] |= CMidchar;
    }
}

version (unittest)
{
    /// Smoke test: lexing "int" yields TOKint32 followed by TOKeof (repeatedly).
    extern (C++) void unittest_lexer()
    {
        //printf("unittest_lexer()\n");
        /* Not much here, just trying things out.
         */
        const(char)* text = "int";
        // The end offset is the length of the text. `text.sizeof` would be
        // the size of the pointer, which put `end` past the literal.
        scope Lexer lex1 = new Lexer(null, cast(char*)text, 0, strlen(text), 0, 0);
        TOK tok;
        tok = lex1.nextToken();
        //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOKint32);
        assert(tok == TOKint32);
        tok = lex1.nextToken();
        assert(tok == TOKeof);
        tok = lex1.nextToken();
        assert(tok == TOKeof);
    }
}

extern (C++) class Lexer
{
public:
    /*************************** Lexer ********************************************/
    extern (C++) static __gshared OutBuffer stringbuffer;
    Loc scanloc; // for error messages
    const(char)* base; // pointer to start of buffer
    const(char)* end; // past end of buffer
    const(char)* p; // current character
    const(char)* line; // start of current line
    Token token;
    int doDocComment; // collect doc comment information
    int anyToken; // !=0 means seen at least one token
    int commentToken; // !=0 means comments are TOKcomment's
    bool errors; // errors occurred during lexing or parsing

    /***********************
     * Set up a lexer over base[begoffset .. endoffset].
     *
     * The scanning code stops on a 0 or 0x1A byte rather than on `end`, so
     * the buffer must contain such a sentinel.
     * If the text starts with "#!", that whole first line is skipped.
     *
     * Params:
     *  filename = name stored in scanloc, used for error messages
     *  base = start of the source buffer
     *  begoffset = offset into base at which to start lexing
     *  endoffset = offset into base of the end of the buffer
     *  doDocComment = stored in this.doDocComment
     *  commentToken = stored in this.commentToken
     */
    final extern (D) this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset, int doDocComment, int commentToken)
    {
        scanloc = Loc(filename, 1, 1);
        //printf("Lexer::Lexer(%p,%d)\n",base,length);
        //printf("lexer.filename = %s\n", filename);
        memset(&token, 0, token.sizeof);
        this.base = base;
        this.end = base + endoffset;
        p = base + begoffset;
        line = p;
        this.doDocComment = doDocComment;
        this.anyToken = 0;
        this.commentToken = commentToken;
        this.errors = false;
        //initKeywords();
        /* If first line starts with '#!', ignore the line
         */
        if (p[0] == '#' && p[1] == '!')
        {
            p += 2;
            while (1)
            {
                char c = *p;
                switch (c)
                {
                case '\n':
                    p++;
                    break;
                case '\r':
                    p++;
                    if (*p == '\n')
                        p++;
                    break;
                case 0:
                case 0x1A:
                    break;
                default:
                    if (c & 0x80)
                    {
                        uint u = decodeUTF();
                        if (u == PS || u == LS)
                            break;
                    }
                    p++;
                    continue;
                }
                break;
            }
            endOfLine();
        }
    }

    /***********************
     * One-time global initialization: character table, identifier table and
     * token table. Also runs unittest_lexer() in unittest builds.
     */
    final static void initLexer()
    {
        cmtable_init();
        Identifier.initTable();
        Token.initTokens();
        version (unittest)
        {
            unittest_lexer();
        }
    }

    /***********************
     * Advance to the next token and return its value.
     * If a token was already produced by peek(), it is copied into `token`
     * and its storage released; otherwise a new token is scanned.
     */
    final TOK nextToken()
    {
        if (token.next)
        {
            Token* t = token.next;
            memcpy(&token, t, Token.sizeof);
            t.free();
        }
        else
        {
            scan(&token);
        }
        //token.print();
        return token.value;
    }

    /***********************
     * Look ahead at next token's value.
     */
    final TOK peekNext()
    {
        return peek(&token).value;
    }

    /***********************
     * Look 2 tokens ahead at value.
     */
    final TOK peekNext2()
    {
        Token* t = peek(&token);
        return peek(t).value;
    }

    /****************************
     * Turn next token in buffer into a token.
     *
     * White space and (unless commentToken is set) comments are skipped.
     * On return t.ptr points at the first character of the token, t.loc is
     * its location and t.value its kind.
     * Params:
     *  t = token to fill in
     */
    final void scan(Token* t)
    {
        uint lastLine = scanloc.linnum;
        Loc startLoc;
        t.blockComment = null;
        t.lineComment = null;
        while (1)
        {
            t.ptr = p;
            //printf("p = %p, *p = '%c'\n",p,*p);
            t.loc = loc();
            switch (*p)
            {
            case 0:
            case 0x1A:
                t.value = TOKeof; // end of file
                return;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue;
                // skip white space
            case '\r':
                p++;
                if (*p != '\n') // if CR stands by itself
                    endOfLine();
                continue;
                // skip white space
            case '\n':
                p++;
                endOfLine();
                continue;
                // skip white space
            case '0': .. case '9':
                t.value = number(t);
                return;
            case '\'':
                t.value = charConstant(t, 0);
                return;
            case 'r':
                if (p[1] != '"')
                    goto case_ident;
                p++;
                goto case; // r"..." is lexed like `...`
            case '`':
                t.value = wysiwygStringConstant(t, *p);
                return;
            case 'x':
                if (p[1] != '"')
                    goto case_ident;
                p++;
                t.value = hexStringConstant(t);
                return;
            case 'q':
                if (p[1] == '"')
                {
                    p++;
                    t.value = delimitedStringConstant(t);
                    return;
                }
                else if (p[1] == '{')
                {
                    p++;
                    t.value = tokenStringConstant(t);
                    return;
                }
                else
                    goto case_ident;
            case '"':
                t.value = escapeStringConstant(t, 0);
                return;
            case 'a': .. case 'p':
                /*case 'q': case 'r':*/
            case 's': .. case 'w':
                /*case 'x':*/
            case 'y':
            case 'z':
            case 'A': .. case 'Z':
            case '_':
            case_ident:
                {
                    char c;
                    // Consume the rest of the identifier
                    while (1)
                    {
                        c = *++p;
                        if (isidchar(c))
                            continue;
                        else if (c & 0x80)
                        {
                            const(char)* s = p;
                            uint u = decodeUTF();
                            if (isUniAlpha(u))
                                continue;
                            error("char 0x%04x not allowed in identifier", u);
                            p = s;
                        }
                        break;
                    }
                    Identifier id = Identifier.idPool(cast(char*)t.ptr, p - t.ptr);
                    t.ident = id;
                    t.value = cast(TOK)id.value;
                    anyToken = 1;
                    if (*t.ptr == '_') // if special identifier token
                    {
                        static __gshared bool initdone = false;
                        static __gshared char[11 + 1] date;
                        static __gshared char[8 + 1] time;
                        static __gshared char[24 + 1] timestamp;
                        if (!initdone) // lazy evaluation
                        {
                            initdone = true;
                            time_t ct;
                            .time(&ct);
                            // ctime() yields "Www Mmm dd hh:mm:ss yyyy\n"
                            char* ts = ctime(&ct);
                            assert(ts);
                            sprintf(&date[0], "%.6s %.4s", ts + 4, ts + 20);
                            sprintf(&time[0], "%.8s", ts + 11);
                            sprintf(&timestamp[0], "%.24s", ts);
                        }
                        if (id == Id.DATE)
                        {
                            t.ustring = cast(char*)date;
                            goto Lstr;
                        }
                        else if (id == Id.TIME)
                        {
                            t.ustring = cast(char*)time;
                            goto Lstr;
                        }
                        else if (id == Id.VENDOR)
                        {
                            t.ustring = cast(char*)global.compiler.vendor;
                            goto Lstr;
                        }
                        else if (id == Id.TIMESTAMP)
                        {
                            t.ustring = cast(char*)timestamp;
                        Lstr:
                            t.value = TOKstring;
                            t.postfix = 0;
                            t.len = cast(uint)strlen(cast(char*)t.ustring);
                        }
                        else if (id == Id.VERSIONX)
                        {
                            // Parse "major.minor" starting at the second
                            // character of global._version
                            uint major = 0;
                            uint minor = 0;
                            bool point = false;
                            for (const(char)* pv = global._version + 1; 1; pv++)
                            {
                                c = *pv;
                                if (isdigit(cast(char)c))
                                    minor = minor * 10 + c - '0';
                                else if (c == '.')
                                {
                                    if (point)
                                        break;
                                    // ignore everything after second '.'
                                    point = true;
                                    major = minor;
                                    minor = 0;
                                }
                                else
                                    break;
                            }
                            t.value = TOKint64v;
                            t.uns64value = major * 1000 + minor;
                        }
                        else if (id == Id.EOFX)
                        {
                            t.value = TOKeof;
                            // Advance scanner to end of file
                            while (!(*p == 0 || *p == 0x1A))
                                p++;
                        }
                    }
                    //printf("t->value = %d\n",t->value);
                    return;
                }
            case '/':
                p++;
                switch (*p)
                {
                case '=':
                    p++;
                    t.value = TOKdivass;
                    return;
                case '*':
                    p++;
                    startLoc = loc();
                    while (1)
                    {
                        // Advance to the next '/'
                        while (1)
                        {
                            char c = *p;
                            switch (c)
                            {
                            case '/':
                                break;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /* */ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOKeof;
                                return;
                            default:
                                if (c & 0x80)
                                {
                                    uint u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        p++;
                        // The comment ends at "*/", but the '*' of the
                        // opening "/*" does not count (so "/*/" is not closed)
                        if (p[-2] == '*' && p - 3 != t.ptr)
                            break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOKcomment;
                        return;
                    }
                    else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                    {
                        // if /** but not /**/
                        getDocComment(t, lastLine == startLoc.linnum);
                    }
                    continue;
                case '/':
                    // do // style comments
                    startLoc = loc();
                    while (1)
                    {
                        char c = *++p;
                        switch (c)
                        {
                        case '\n':
                            break;
                        case '\r':
                            if (p[1] == '\n')
                                p++;
                            break;
                        case 0:
                        case 0x1A:
                            if (commentToken)
                            {
                                p = end;
                                t.loc = startLoc;
                                t.value = TOKcomment;
                                return;
                            }
                            if (doDocComment && t.ptr[2] == '/')
                                getDocComment(t, lastLine == startLoc.linnum);
                            p = end;
                            t.loc = loc();
                            t.value = TOKeof;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                uint u = decodeUTF();
                                if (u == PS || u == LS)
                                    break;
                            }
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        p++;
                        endOfLine();
                        t.loc = startLoc;
                        t.value = TOKcomment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '/')
                        getDocComment(t, lastLine == startLoc.linnum);
                    p++;
                    endOfLine();
                    continue;
                case '+':
                    {
                        // Nesting /+ +/ comment
                        int nest;
                        startLoc = loc();
                        p++;
                        nest = 1;
                        while (1)
                        {
                            char c = *p;
                            switch (c)
                            {
                            case '/':
                                p++;
                                if (*p == '+')
                                {
                                    p++;
                                    nest++;
                                }
                                continue;
                            case '+':
                                p++;
                                if (*p == '/')
                                {
                                    p++;
                                    if (--nest == 0)
                                        break;
                                }
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /+ +/ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOKeof;
                                return;
                            default:
                                if (c & 0x80)
                                {
                                    uint u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        if (commentToken)
                        {
                            t.loc = startLoc;
                            t.value = TOKcomment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                        {
                            // if /++ but not /++/
                            getDocComment(t, lastLine == startLoc.linnum);
                        }
                        continue;
                    }
                default:
                    break;
                }
                t.value = TOKdiv;
                return;
            case '.':
                p++;
                if (isdigit(*p))
                {
                    /* Note that we don't allow ._1 and ._ as being
                     * valid floating point numbers.
                     */
                    p--;
                    t.value = inreal(t);
                }
                else if (p[0] == '.')
                {
                    if (p[1] == '.')
                    {
                        p += 2;
                        t.value = TOKdotdotdot;
                    }
                    else
                    {
                        p++;
                        t.value = TOKslice;
                    }
                }
                else
                    t.value = TOKdot;
                return;
            case '&':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKandass;
                }
                else if (*p == '&')
                {
                    p++;
                    t.value = TOKandand;
                }
                else
                    t.value = TOKand;
                return;
            case '|':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKorass;
                }
                else if (*p == '|')
                {
                    p++;
                    t.value = TOKoror;
                }
                else
                    t.value = TOKor;
                return;
            case '-':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKminass;
                }
                else if (*p == '-')
                {
                    p++;
                    t.value = TOKminusminus;
                }
                else
                    t.value = TOKmin;
                return;
            case '+':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKaddass;
                }
                else if (*p == '+')
                {
                    p++;
                    t.value = TOKplusplus;
                }
                else
                    t.value = TOKadd;
                return;
            case '<':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKle; // <=
                }
                else if (*p == '<')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOKshlass; // <<=
                    }
                    else
                        t.value = TOKshl; // <<
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOKleg; // <>=
                    }
                    else
                        t.value = TOKlg; // <>
                }
                else
                    t.value = TOKlt; // <
                return;
            case '>':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKge; // >=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOKshrass; // >>=
                    }
                    else if (*p == '>')
                    {
                        p++;
                        if (*p == '=')
                        {
                            p++;
                            t.value = TOKushrass; // >>>=
                        }
                        else
                            t.value = TOKushr; // >>>
                    }
                    else
                        t.value = TOKshr; // >>
                }
                else
                    t.value = TOKgt; // >
                return;
            case '!':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKnotequal; // !=
                }
                else if (*p == '<')
                {
                    p++;
                    if (*p == '>')
                    {
                        p++;
                        if (*p == '=')
                        {
                            p++;
                            t.value = TOKunord; // !<>=
                        }
                        else
                            t.value = TOKue; // !<>
                    }
                    else if (*p == '=')
                    {
                        p++;
                        t.value = TOKug; // !<=
                    }
                    else
                        t.value = TOKuge; // !<
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOKul; // !>=
                    }
                    else
                        t.value = TOKule; // !>
                }
                else
                    t.value = TOKnot; // !
                return;
            case '=':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKequal; // ==
                }
                else if (*p == '>')
                {
                    p++;
                    t.value = TOKgoesto; // =>
                }
                else
                    t.value = TOKassign; // =
                return;
            case '~':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKcatass; // ~=
                }
                else
                    t.value = TOKtilde; // ~
                return;
            case '^':
                p++;
                if (*p == '^')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOKpowass; // ^^=
                    }
                    else
                        t.value = TOKpow; // ^^
                }
                else if (*p == '=')
                {
                    p++;
                    t.value = TOKxorass; // ^=
                }
                else
                    t.value = TOKxor; // ^
                return;
            case '(':
                p++;
                t.value = TOKlparen;
                return;
            case ')':
                p++;
                t.value = TOKrparen;
                return;
            case '[':
                p++;
                t.value = TOKlbracket;
                return;
            case ']':
                p++;
                t.value = TOKrbracket;
                return;
            case '{':
                p++;
                t.value = TOKlcurly;
                return;
            case '}':
                p++;
                t.value = TOKrcurly;
                return;
            case '?':
                p++;
                t.value = TOKquestion;
                return;
            case ',':
                p++;
                t.value = TOKcomma;
                return;
            case ';':
                p++;
                t.value = TOKsemicolon;
                return;
            case ':':
                p++;
                t.value = TOKcolon;
                return;
            case '$':
                p++;
                t.value = TOKdollar;
                return;
            case '@':
                p++;
                t.value = TOKat;
                return;
            case '*':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKmulass;
                }
                else
                    t.value = TOKmul;
                return;
            case '%':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOKmodass;
                }
                else
                    t.value = TOKmod;
                return;
            case '#':
                {
                    // "#line" is handled here; any other '#' yields TOKpound
                    // (the token scanned after the '#' is consumed either way)
                    p++;
                    Token n;
                    scan(&n);
                    if (n.value == TOKidentifier && n.ident == Id.line)
                    {
                        poundLine();
                        continue;
                    }
                    else
                    {
                        t.value = TOKpound;
                        return;
                    }
                }
            default:
                {
                    uint c = *p;
                    if (c & 0x80)
                    {
                        c = decodeUTF();
                        // Check for start of unicode identifier
                        if (isUniAlpha(c))
                            goto case_ident;
                        if (c == PS || c == LS)
                        {
                            endOfLine();
                            p++;
                            continue;
                        }
                    }
                    if (c < 0x80 && isprint(c))
                        error("character '%c' is not a valid token", c);
                    else
                        error("character 0x%02x is not a valid token", c);
                    p++;
                    continue;
                }
            }
        }
    }

    /***********************
     * Return the token following ct, scanning and linking a new one onto
     * ct.next if it has not been scanned yet.
     */
    final Token* peek(Token* ct)
    {
        Token* t;
        if (ct.next)
            t = ct.next;
        else
        {
            t = Token.alloc();
            scan(t);
            ct.next = t;
        }
        return t;
    }

    /*********************************
     * tk is on the opening (.
     * Look ahead and return token that is past the closing ).
978 */ 979 final Token* peekPastParen(Token* tk) 980 { 981 //printf("peekPastParen()\n"); 982 int parens = 1; 983 int curlynest = 0; 984 while (1) 985 { 986 tk = peek(tk); 987 //tk->print(); 988 switch (tk.value) 989 { 990 case TOKlparen: 991 parens++; 992 continue; 993 case TOKrparen: 994 --parens; 995 if (parens) 996 continue; 997 tk = peek(tk); 998 break; 999 case TOKlcurly: 1000 curlynest++; 1001 continue; 1002 case TOKrcurly: 1003 if (--curlynest >= 0) 1004 continue; 1005 break; 1006 case TOKsemicolon: 1007 if (curlynest) 1008 continue; 1009 break; 1010 case TOKeof: 1011 break; 1012 default: 1013 continue; 1014 } 1015 return tk; 1016 } 1017 } 1018 1019 /******************************************* 1020 * Parse escape sequence. 1021 */ 1022 final uint escapeSequence() 1023 { 1024 uint c = *p; 1025 int n; 1026 int ndigits; 1027 switch (c) 1028 { 1029 case '\'': 1030 case '"': 1031 case '?': 1032 case '\\': 1033 Lconsume: 1034 p++; 1035 break; 1036 case 'a': 1037 c = 7; 1038 goto Lconsume; 1039 case 'b': 1040 c = 8; 1041 goto Lconsume; 1042 case 'f': 1043 c = 12; 1044 goto Lconsume; 1045 case 'n': 1046 c = 10; 1047 goto Lconsume; 1048 case 'r': 1049 c = 13; 1050 goto Lconsume; 1051 case 't': 1052 c = 9; 1053 goto Lconsume; 1054 case 'v': 1055 c = 11; 1056 goto Lconsume; 1057 case 'u': 1058 ndigits = 4; 1059 goto Lhex; 1060 case 'U': 1061 ndigits = 8; 1062 goto Lhex; 1063 case 'x': 1064 ndigits = 2; 1065 Lhex: 1066 p++; 1067 c = *p; 1068 if (ishex(cast(char)c)) 1069 { 1070 uint v; 1071 n = 0; 1072 v = 0; 1073 while (1) 1074 { 1075 if (isdigit(cast(char)c)) 1076 c -= '0'; 1077 else if (islower(c)) 1078 c -= 'a' - 10; 1079 else 1080 c -= 'A' - 10; 1081 v = v * 16 + c; 1082 c = *++p; 1083 if (++n == ndigits) 1084 break; 1085 if (!ishex(cast(char)c)) 1086 { 1087 error("escape hex sequence has %d hex digits instead of %d", n, ndigits); 1088 break; 1089 } 1090 } 1091 if (ndigits != 2 && !utf_isValidDchar(v)) 1092 { 1093 error("invalid UTF character \\U%08x", v); 1094 v = 
'?'; // recover with valid UTF character 1095 } 1096 c = v; 1097 } 1098 else 1099 error("undefined escape hex sequence \\%c", c); 1100 break; 1101 case '&': 1102 // named character entity 1103 for (const(char)* idstart = ++p; 1; p++) 1104 { 1105 switch (*p) 1106 { 1107 case ';': 1108 c = HtmlNamedEntity(idstart, p - idstart); 1109 if (c == ~0) 1110 { 1111 error("unnamed character entity &%.*s;", cast(int)(p - idstart), idstart); 1112 c = ' '; 1113 } 1114 p++; 1115 break; 1116 default: 1117 if (isalpha(*p) || (p != idstart && isdigit(*p))) 1118 continue; 1119 error("unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart); 1120 break; 1121 } 1122 break; 1123 } 1124 break; 1125 case 0: 1126 case 0x1A: 1127 // end of file 1128 c = '\\'; 1129 break; 1130 default: 1131 if (isoctal(cast(char)c)) 1132 { 1133 uint v; 1134 n = 0; 1135 v = 0; 1136 do 1137 { 1138 v = v * 8 + (c - '0'); 1139 c = *++p; 1140 } 1141 while (++n < 3 && isoctal(cast(char)c)); 1142 c = v; 1143 if (c > 0xFF) 1144 error("escape octal sequence \\%03o is larger than \\377", c); 1145 } 1146 else 1147 error("undefined escape sequence \\%c", c); 1148 break; 1149 } 1150 return c; 1151 } 1152 1153 /************************************** 1154 */ 1155 final TOK wysiwygStringConstant(Token* t, int tc) 1156 { 1157 uint c; 1158 Loc start = loc(); 1159 p++; 1160 stringbuffer.reset(); 1161 while (1) 1162 { 1163 c = *p++; 1164 switch (c) 1165 { 1166 case '\n': 1167 endOfLine(); 1168 break; 1169 case '\r': 1170 if (*p == '\n') 1171 continue; 1172 // ignore 1173 c = '\n'; // treat EndOfLine as \n character 1174 endOfLine(); 1175 break; 1176 case 0: 1177 case 0x1A: 1178 error("unterminated string constant starting at %s", start.toChars()); 1179 t.ustring = cast(char*)""; 1180 t.len = 0; 1181 t.postfix = 0; 1182 return TOKstring; 1183 case '"': 1184 case '`': 1185 if (c == tc) 1186 { 1187 t.len = cast(uint)stringbuffer.offset; 1188 stringbuffer.writeByte(0); 1189 t.ustring = 
cast(char*)mem.xmalloc(stringbuffer.offset); 1190 memcpy(t.ustring, stringbuffer.data, stringbuffer.offset); 1191 stringPostfix(t); 1192 return TOKstring; 1193 } 1194 break; 1195 default: 1196 if (c & 0x80) 1197 { 1198 p--; 1199 uint u = decodeUTF(); 1200 p++; 1201 if (u == PS || u == LS) 1202 endOfLine(); 1203 stringbuffer.writeUTF8(u); 1204 continue; 1205 } 1206 break; 1207 } 1208 stringbuffer.writeByte(c); 1209 } 1210 } 1211 1212 /************************************** 1213 * Lex hex strings: 1214 * x"0A ae 34FE BD" 1215 */ 1216 final TOK hexStringConstant(Token* t) 1217 { 1218 uint c; 1219 Loc start = loc(); 1220 uint n = 0; 1221 uint v = ~0; // dead assignment, needed to suppress warning 1222 p++; 1223 stringbuffer.reset(); 1224 while (1) 1225 { 1226 c = *p++; 1227 switch (c) 1228 { 1229 case ' ': 1230 case '\t': 1231 case '\v': 1232 case '\f': 1233 continue; 1234 // skip white space 1235 case '\r': 1236 if (*p == '\n') 1237 continue; 1238 // ignore 1239 // Treat isolated '\r' as if it were a '\n' 1240 case '\n': 1241 endOfLine(); 1242 continue; 1243 case 0: 1244 case 0x1A: 1245 error("unterminated string constant starting at %s", start.toChars()); 1246 t.ustring = cast(char*)""; 1247 t.len = 0; 1248 t.postfix = 0; 1249 return TOKxstring; 1250 case '"': 1251 if (n & 1) 1252 { 1253 error("odd number (%d) of hex characters in hex string", n); 1254 stringbuffer.writeByte(v); 1255 } 1256 t.len = cast(uint)stringbuffer.offset; 1257 stringbuffer.writeByte(0); 1258 t.ustring = cast(char*)mem.xmalloc(stringbuffer.offset); 1259 memcpy(t.ustring, stringbuffer.data, stringbuffer.offset); 1260 stringPostfix(t); 1261 return TOKxstring; 1262 default: 1263 if (c >= '0' && c <= '9') 1264 c -= '0'; 1265 else if (c >= 'a' && c <= 'f') 1266 c -= 'a' - 10; 1267 else if (c >= 'A' && c <= 'F') 1268 c -= 'A' - 10; 1269 else if (c & 0x80) 1270 { 1271 p--; 1272 uint u = decodeUTF(); 1273 p++; 1274 if (u == PS || u == LS) 1275 endOfLine(); 1276 else 1277 error("non-hex character 
\\u%04x in hex string", u); 1278 } 1279 else 1280 error("non-hex character '%c' in hex string", c); 1281 if (n & 1) 1282 { 1283 v = (v << 4) | c; 1284 stringbuffer.writeByte(v); 1285 } 1286 else 1287 v = c; 1288 n++; 1289 break; 1290 } 1291 } 1292 } 1293 1294 /************************************** 1295 * Lex delimited strings: 1296 * q"(foo(xxx))" // "foo(xxx)" 1297 * q"[foo(]" // "foo(" 1298 * q"/foo]/" // "foo]" 1299 * q"HERE 1300 * foo 1301 * HERE" // "foo\n" 1302 * Input: 1303 * p is on the " 1304 */ 1305 final TOK delimitedStringConstant(Token* t) 1306 { 1307 uint c; 1308 Loc start = loc(); 1309 uint delimleft = 0; 1310 uint delimright = 0; 1311 uint nest = 1; 1312 uint nestcount = ~0; // dead assignment, needed to suppress warning 1313 Identifier hereid = null; 1314 uint blankrol = 0; 1315 uint startline = 0; 1316 p++; 1317 stringbuffer.reset(); 1318 while (1) 1319 { 1320 c = *p++; 1321 //printf("c = '%c'\n", c); 1322 switch (c) 1323 { 1324 case '\n': 1325 Lnextline: 1326 endOfLine(); 1327 startline = 1; 1328 if (blankrol) 1329 { 1330 blankrol = 0; 1331 continue; 1332 } 1333 if (hereid) 1334 { 1335 stringbuffer.writeUTF8(c); 1336 continue; 1337 } 1338 break; 1339 case '\r': 1340 if (*p == '\n') 1341 continue; 1342 // ignore 1343 c = '\n'; // treat EndOfLine as \n character 1344 goto Lnextline; 1345 case 0: 1346 case 0x1A: 1347 error("unterminated delimited string constant starting at %s", start.toChars()); 1348 t.ustring = cast(char*)""; 1349 t.len = 0; 1350 t.postfix = 0; 1351 return TOKstring; 1352 default: 1353 if (c & 0x80) 1354 { 1355 p--; 1356 c = decodeUTF(); 1357 p++; 1358 if (c == PS || c == LS) 1359 goto Lnextline; 1360 } 1361 break; 1362 } 1363 if (delimleft == 0) 1364 { 1365 delimleft = c; 1366 nest = 1; 1367 nestcount = 1; 1368 if (c == '(') 1369 delimright = ')'; 1370 else if (c == '{') 1371 delimright = '}'; 1372 else if (c == '[') 1373 delimright = ']'; 1374 else if (c == '<') 1375 delimright = '>'; 1376 else if (isalpha(c) || c == '_' || (c 
>= 0x80 && isUniAlpha(c))) 1377 { 1378 // Start of identifier; must be a heredoc 1379 Token tok; 1380 p--; 1381 scan(&tok); // read in heredoc identifier 1382 if (tok.value != TOKidentifier) 1383 { 1384 error("identifier expected for heredoc, not %s", tok.toChars()); 1385 delimright = c; 1386 } 1387 else 1388 { 1389 hereid = tok.ident; 1390 //printf("hereid = '%s'\n", hereid->toChars()); 1391 blankrol = 1; 1392 } 1393 nest = 0; 1394 } 1395 else 1396 { 1397 delimright = c; 1398 nest = 0; 1399 if (isspace(c)) 1400 error("delimiter cannot be whitespace"); 1401 } 1402 } 1403 else 1404 { 1405 if (blankrol) 1406 { 1407 error("heredoc rest of line should be blank"); 1408 blankrol = 0; 1409 continue; 1410 } 1411 if (nest == 1) 1412 { 1413 if (c == delimleft) 1414 nestcount++; 1415 else if (c == delimright) 1416 { 1417 nestcount--; 1418 if (nestcount == 0) 1419 goto Ldone; 1420 } 1421 } 1422 else if (c == delimright) 1423 goto Ldone; 1424 if (startline && isalpha(c) && hereid) 1425 { 1426 Token tok; 1427 const(char)* psave = p; 1428 p--; 1429 scan(&tok); // read in possible heredoc identifier 1430 //printf("endid = '%s'\n", tok.ident->toChars()); 1431 if (tok.value == TOKidentifier && tok.ident.equals(hereid)) 1432 { 1433 /* should check that rest of line is blank 1434 */ 1435 goto Ldone; 1436 } 1437 p = psave; 1438 } 1439 stringbuffer.writeUTF8(c); 1440 startline = 0; 1441 } 1442 } 1443 Ldone: 1444 if (*p == '"') 1445 p++; 1446 else if (hereid) 1447 error("delimited string must end in %s\"", hereid.toChars()); 1448 else 1449 error("delimited string must end in %c\"", delimright); 1450 t.len = cast(uint)stringbuffer.offset; 1451 stringbuffer.writeByte(0); 1452 t.ustring = cast(char*)mem.xmalloc(stringbuffer.offset); 1453 memcpy(t.ustring, stringbuffer.data, stringbuffer.offset); 1454 stringPostfix(t); 1455 return TOKstring; 1456 } 1457 1458 /************************************** 1459 * Lex delimited strings: 1460 * q{ foo(xxx) } // " foo(xxx) " 1461 * q{foo(} // "foo(" 
1462 * q{{foo}"}"} // "{foo}"}"" 1463 * Input: 1464 * p is on the q 1465 */ 1466 final TOK tokenStringConstant(Token* t) 1467 { 1468 uint nest = 1; 1469 Loc start = loc(); 1470 const(char)* pstart = ++p; 1471 while (1) 1472 { 1473 Token tok; 1474 scan(&tok); 1475 switch (tok.value) 1476 { 1477 case TOKlcurly: 1478 nest++; 1479 continue; 1480 case TOKrcurly: 1481 if (--nest == 0) 1482 { 1483 t.len = cast(uint)(p - 1 - pstart); 1484 t.ustring = cast(char*)mem.xmalloc(t.len + 1); 1485 memcpy(t.ustring, pstart, t.len); 1486 t.ustring[t.len] = 0; 1487 stringPostfix(t); 1488 return TOKstring; 1489 } 1490 continue; 1491 case TOKeof: 1492 error("unterminated token string constant starting at %s", start.toChars()); 1493 t.ustring = cast(char*)""; 1494 t.len = 0; 1495 t.postfix = 0; 1496 return TOKstring; 1497 default: 1498 continue; 1499 } 1500 } 1501 } 1502 1503 /************************************** 1504 */ 1505 final TOK escapeStringConstant(Token* t, int wide) 1506 { 1507 uint c; 1508 Loc start = loc(); 1509 p++; 1510 stringbuffer.reset(); 1511 while (1) 1512 { 1513 c = *p++; 1514 switch (c) 1515 { 1516 case '\\': 1517 switch (*p) 1518 { 1519 case 'u': 1520 case 'U': 1521 case '&': 1522 c = escapeSequence(); 1523 stringbuffer.writeUTF8(c); 1524 continue; 1525 default: 1526 c = escapeSequence(); 1527 break; 1528 } 1529 break; 1530 case '\n': 1531 endOfLine(); 1532 break; 1533 case '\r': 1534 if (*p == '\n') 1535 continue; 1536 // ignore 1537 c = '\n'; // treat EndOfLine as \n character 1538 endOfLine(); 1539 break; 1540 case '"': 1541 t.len = cast(uint)stringbuffer.offset; 1542 stringbuffer.writeByte(0); 1543 t.ustring = cast(char*)mem.xmalloc(stringbuffer.offset); 1544 memcpy(t.ustring, stringbuffer.data, stringbuffer.offset); 1545 stringPostfix(t); 1546 return TOKstring; 1547 case 0: 1548 case 0x1A: 1549 p--; 1550 error("unterminated string constant starting at %s", start.toChars()); 1551 t.ustring = cast(char*)""; 1552 t.len = 0; 1553 t.postfix = 0; 1554 return 
TOKstring; 1555 default: 1556 if (c & 0x80) 1557 { 1558 p--; 1559 c = decodeUTF(); 1560 if (c == LS || c == PS) 1561 { 1562 c = '\n'; 1563 endOfLine(); 1564 } 1565 p++; 1566 stringbuffer.writeUTF8(c); 1567 continue; 1568 } 1569 break; 1570 } 1571 stringbuffer.writeByte(c); 1572 } 1573 } 1574 1575 /************************************** 1576 */ 1577 final TOK charConstant(Token* t, int wide) 1578 { 1579 uint c; 1580 TOK tk = TOKcharv; 1581 //printf("Lexer::charConstant\n"); 1582 p++; 1583 c = *p++; 1584 switch (c) 1585 { 1586 case '\\': 1587 switch (*p) 1588 { 1589 case 'u': 1590 t.uns64value = escapeSequence(); 1591 tk = TOKwcharv; 1592 break; 1593 case 'U': 1594 case '&': 1595 t.uns64value = escapeSequence(); 1596 tk = TOKdcharv; 1597 break; 1598 default: 1599 t.uns64value = escapeSequence(); 1600 break; 1601 } 1602 break; 1603 case '\n': 1604 L1: 1605 endOfLine(); 1606 case '\r': 1607 case 0: 1608 case 0x1A: 1609 case '\'': 1610 error("unterminated character constant"); 1611 t.uns64value = '?'; 1612 return tk; 1613 default: 1614 if (c & 0x80) 1615 { 1616 p--; 1617 c = decodeUTF(); 1618 p++; 1619 if (c == LS || c == PS) 1620 goto L1; 1621 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) 1622 tk = TOKwcharv; 1623 else 1624 tk = TOKdcharv; 1625 } 1626 t.uns64value = c; 1627 break; 1628 } 1629 if (*p != '\'') 1630 { 1631 error("unterminated character constant"); 1632 t.uns64value = '?'; 1633 return tk; 1634 } 1635 p++; 1636 return tk; 1637 } 1638 1639 /*************************************** 1640 * Get postfix of string literal. 1641 */ 1642 final void stringPostfix(Token* t) 1643 { 1644 switch (*p) 1645 { 1646 case 'c': 1647 case 'w': 1648 case 'd': 1649 t.postfix = *p; 1650 p++; 1651 break; 1652 default: 1653 t.postfix = 0; 1654 break; 1655 } 1656 } 1657 1658 /************************************** 1659 * Read in a number. 1660 * If it's an integer, store it in tok.TKutok.Vlong. 
*      integers can be decimal, octal or hex
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's double, store it in tok.TKutok.Vdouble.
     * Returns:
     *      TKnum
     *      TKdouble,...
     */
    final TOK number(Token* t)
    {
        /* Summary of the code below:
         *  - a leading '0' selects base 8, 16 ("0x") or 2 ("0b");
         *  - anything that looks like a floating point literal rewinds p to
         *    the start of the literal and is handed to inreal();
         *  - '_' separators are skipped;
         *  - the value is stored in t.uns64value and the returned token kind
         *    is chosen from the value and the U/L suffixes.
         */
        int base = 10;
        const(char)* start = p;
        uint c;
        uinteger_t n = 0; // unsigned >=64 bit integer type
        int d;
        bool err = false; // an error was already reported for this literal
        bool overflow = false;
        c = *p;
        if (c == '0')
        {
            ++p;
            c = *p;
            switch (c)
            {
            case '0', '1', '2', '3', '4', '5', '6', '7':
                n = c - '0';
                ++p;
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        while (1)
        {
            c = *p;
            switch (c)
            {
            case '0':
            case '1':
                ++p;
                d = c - '0';
                break;
            case '2', '3', '4', '5', '6', '7':
                if (base == 2 && !err)
                {
                    error("binary digit expected");
                    err = true;
                }
                ++p;
                d = c - '0';
                break;
            case '8':
            case '9':
                ++p;
                if (base < 10 && !err)
                {
                    error("radix %d digit expected, not '%c'", base, c);
                    err = true;
                }
                d = c - '0';
                break;
            case 'a', 'b', 'c', 'd', 'e', 'f':
            case 'A', 'B', 'C', 'D', 'E', 'F':
                ++p;
                if (base != 16)
                {
                    // exponent or float suffix: this is a floating point literal
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                    if (!err)
                    {
                        error("radix %d digit expected, not '%c'", base, c);
                        err = true;
                    }
                }
                if (c >= 'a')
                    d = c + 10 - 'a';
                else
                    d = c + 10 - 'A';
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal
            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // rescan the whole literal as a floating point number
                p = start;
                return inreal(t);
            case '_':
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // Accumulate the digit, detecting wrap-around of n
            uinteger_t n2 = n * base;
            if ((n2 / base != n || n2 + d < n))
            {
                overflow = true;
            }
            n = n2 + d;
            // if n needs more than 64 bits
            if (n.sizeof > 8 && n > 0xFFFFFFFFFFFFFFFFUL)
            {
                overflow = true;
            }
        }
    Ldone:
        if (overflow && !err)
        {
            error("integer overflow");
            err = true;
        }
        enum FLAGS : int
        {
            FLAGS_none = 0,
            FLAGS_decimal = 1, // decimal
            FLAGS_unsigned = 2, // u or U suffix
            FLAGS_long = 4, // L suffix
        }

        alias FLAGS_none = FLAGS.FLAGS_none;
        alias FLAGS_decimal = FLAGS.FLAGS_decimal;
        alias FLAGS_unsigned = FLAGS.FLAGS_unsigned;
        alias FLAGS_long = FLAGS.FLAGS_long;
        FLAGS flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const(char)* psuffix = p;
        while (1)
        {
            char f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS_unsigned;
                goto L1;
            case 'l':
                f = FLAGS_long;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                goto L1;
            case 'L':
                f = FLAGS_long;
            L1:
                p++;
                // the same suffix given twice
                if ((flags & f) && !err)
                {
                    error("unrecognized token");
                    err = true;
                }
                flags = cast(FLAGS)(flags | f);
                continue;
            default:
                break;
            }
            break;
        }
        if (base == 8 && n >= 8)
        {
            // The precision argument of "%.*s" must be an int, not a pointer difference
            const int suffixLen = cast(int)(p - psuffix);
            error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", n, suffixLen, psuffix, n, suffixLen, psuffix);
        }
        TOK result;
        switch (flags)
        {
        case FLAGS_none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOKuns64v;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOKint64v;
            else if (n & 0x80000000)
                result = TOKuns32v;
            else
                result = TOKint32v;
            break;
        case FLAGS_decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOKuns64v;
            }
            else if (n & 0xFFFFFFFF80000000L)
                result = TOKint64v;
            else
                result = TOKint32v;
            break;
        case FLAGS_unsigned:
        case FLAGS_decimal | FLAGS_unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOKuns64v;
            else
                result = TOKuns32v;
            break;
        case FLAGS_decimal | FLAGS_long:
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOKuns64v;
            }
            else
                result = TOKint64v;
            break;
        case FLAGS_long:
            if (n & 0x8000000000000000L)
                result = TOKuns64v;
            else
                result = TOKint64v;
            break;
        case FLAGS_unsigned | FLAGS_long:
        case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
            result = TOKuns64v;
            break;
        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        t.uns64value = n;
        return result;
    }

    /**************************************
     * Read in characters, converting them to real.
     * On entry p points at the first character of the literal. The text of
     * the literal, with '_' separators removed, is converted and stored in
     * t.float80value.
     * Returns:
     *      TOKfloat32v, TOKfloat64v or TOKfloat80v depending on the suffix
     *      (f/F, none, L), or the matching TOKimaginary* kind when the
     *      literal is followed by 'i'.
     * Bugs:
     *      Exponent overflow not detected.
     *      Too much requested precision is not detected.
     */
    final TOK inreal(Token* t)
    {
        //printf("Lexer::inreal()\n");
        debug
        {
            assert(*p == '.' || isdigit(*p));
        }
        stringbuffer.reset();
        const(char)* pstart = p;
        bool hex = false;
        uint c = *p++;
        // Leading '0x'
        if (c == '0')
        {
            c = *p++;
            if (c == 'x' || c == 'X')
            {
                hex = true;
                c = *p++;
            }
        }
        // Digits to left of '.'
        while (1)
        {
            if (c == '.')
            {
                c = *p++;
                break;
            }
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        // Digits to right of '.'
        while (1)
        {
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        // Exponent: 'e'/'E', or 'p'/'P' for hex floats
        if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
        {
            c = *p++;
            if (c == '-' || c == '+')
            {
                c = *p++;
            }
            bool anyexp = false;
            while (1)
            {
                if (isdigit(c))
                {
                    anyexp = true;
                    c = *p++;
                    continue;
                }
                if (c == '_')
                {
                    c = *p++;
                    continue;
                }
                if (!anyexp)
                    error("missing exponent");
                break;
            }
        }
        else if (hex)
            error("exponent required for hex float");
        // c holds the first character after the literal; point p back at it
        --p;
        // Copy the literal without '_' separators, 0 terminated
        while (pstart < p)
        {
            if (*pstart != '_')
                stringbuffer.writeByte(*pstart);
            ++pstart;
        }
        stringbuffer.writeByte(0);
        TOK result;
        t.float80value = Port.strtold(cast(char*)stringbuffer.data, null);
        // NOTE(review): errno is cleared after the strtold() call, so only the
        // strtof()/strtod() calls below can trigger the ERANGE report and
        // 'L'-suffixed literals are never range checked - confirm this is intended.
        errno = 0;
        switch (*p)
        {
        case 'F':
        case 'f':
            // Only interested in errno return
            cast(void)Port.strtof(cast(char*)stringbuffer.data, null);
            result = TOKfloat32v;
            p++;
            break;
        default:
            /* Should do our own strtod(), since dmc and linux gcc
             * accept 2.22507e-308, while apple gcc will only take
             * 2.22508e-308. Not sure who is right.
             */
            // Only interested in errno return
            cast(void)Port.strtod(cast(char*)stringbuffer.data, null);
            result = TOKfloat64v;
            break;
        case 'l':
            error("use 'L' suffix instead of 'l'");
            goto case;
        case 'L':
            result = TOKfloat80v;
            p++;
            break;
        }
        if (*p == 'i' || *p == 'I')
        {
            if (*p == 'I')
                error("use 'i' suffix instead of 'I'");
            p++;
            switch (result)
            {
            case TOKfloat32v:
                result = TOKimaginary32v;
                break;
            case TOKfloat64v:
                result = TOKimaginary64v;
                break;
            case TOKfloat80v:
                result = TOKimaginary80v;
                break;
            default:
                break;
            }
        }
        if (errno == ERANGE)
        {
            const(char)* suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
            error(scanloc, "number '%s%s' is not representable", cast(char*)stringbuffer.data, suffix);
        }
        debug
        {
            switch (result)
            {
            case TOKfloat32v:
            case TOKfloat64v:
            case TOKfloat80v:
            case TOKimaginary32v:
            case TOKimaginary64v:
            case TOKimaginary80v:
                break;
            default:
                assert(0);
            }
        }
        return result;
    }

    /**************************************
     * Update scanloc.charnum to 1 + the offset of p from the start of the
     * current line, and return scanloc.
     */
    final Loc loc()
    {
        scanloc.charnum = cast(uint)(1 + p - line);
        return scanloc;
    }

    /**************************************
     * Report an error at token.loc (printf-style arguments) and
     * set the errors flag.
     */
    final void error(const(char)* format, ...)
    {
        va_list ap;
        va_start(ap, format);
        .verror(token.loc, format, ap);
        va_end(ap);
        errors = true;
    }

    /**************************************
     * Report an error at the given location (printf-style arguments) and
     * set the errors flag.
     */
    final void error(Loc loc, const(char)* format, ...)
    {
        va_list ap;
        va_start(ap, format);
        .verror(loc, format, ap);
        va_end(ap);
        errors = true;
    }

    /**************************************
     * Report a deprecation at token.loc (printf-style arguments).
     * The errors flag is set only when global.params.useDeprecated is 0.
     */
    final void deprecation(const(char)* format, ...)
    {
        va_list ap;
        va_start(ap, format);
        .vdeprecation(token.loc, format, ap);
        va_end(ap);
        if (global.params.useDeprecated == 0)
            errors = true;
    }

    /*********************************************
     * parse:
     *      #line linnum [filespec]
     * also allow __LINE__ for linnum, and __FILE__ for filespec
     *
     * On success scanloc.linnum (and scanloc.filename, when a filespec was
     * given) are updated; p is left at the terminating newline or end of file.
     * On a syntax error an error is issued and scanloc is not modified.
     */
    final void poundLine()
    {
        Token tok;
        int linnum = this.scanloc.linnum;
        char* filespec = null;
        Loc loc = this.loc();
        scan(&tok);
        if (tok.value == TOKint32v || tok.value == TOKint64v)
        {
            int lin = cast(int)(tok.uns64value - 1);
            if (lin != tok.uns64value - 1)
                error("line number %lld out of range", cast(ulong)tok.uns64value);
            else
                linnum = lin;
        }
        else if (tok.value == TOKline)
        {
            // __LINE__: keep the current line number
        }
        else
            goto Lerr;
        while (1)
        {
            switch (*p)
            {
            case 0:
            case 0x1A:
            case '\n':
            Lnewline:
                this.scanloc.linnum = linnum;
                if (filespec)
                    this.scanloc.filename = filespec;
                return;
            case '\r':
                p++;
                if (*p != '\n')
                {
                    p--;
                    goto Lnewline;
                }
                continue;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue; // skip white space
            case '_':
                if (memcmp(p, cast(char*)"__FILE__", 8) == 0)
                {
                    p += 8;
                    filespec = mem.xstrdup(scanloc.filename);
                    continue;
                }
                goto Lerr;
            case '"':
                if (filespec)
                    goto Lerr;
                stringbuffer.reset();
                p++;
                while (1)
                {
                    uint c;
                    c = *p;
                    switch (c)
                    {
                    case '\n':
                    case '\r':
                    case 0:
                    case 0x1A:
                        goto Lerr;
                    case '"':
                        stringbuffer.writeByte(0);
                        filespec = mem.xstrdup(cast(char*)stringbuffer.data);
                        p++;
                        break;
                    default:
                        if (c & 0x80)
                        {
                            // decodeUTF() leaves p on the last byte of the
                            // sequence, so the whole code point must be
                            // written here, not just its lead byte.
                            uint u = decodeUTF();
                            if (u == PS || u == LS)
                                goto Lerr;
                            stringbuffer.writeUTF8(u);
                        }
                        else
                            stringbuffer.writeByte(c);
                        p++;
                        continue;
                    }
                    break;
                }
                continue;
            default:
                if (*p & 0x80)
                {
                    uint u = decodeUTF();
                    if (u == PS || u == LS)
                        goto Lnewline;
                }
                goto Lerr;
            }
        }
    Lerr:
        error(loc, "#line integer [\"filespec\"]\\n expected");
    }

    /********************************************
     * Decode UTF character.
     * Issue error messages for invalid sequences.
     * Return decoded character, advance p to last character in UTF sequence.
     * On entry p must point at a byte with the high bit set.
     */
    final uint decodeUTF()
    {
        dchar_t u;
        char c;
        const(char)* s = p;
        size_t len;
        size_t idx;
        const(char)* msg;
        c = *s;
        assert(c & 0x80);
        // Check length of remaining string up to 6 UTF-8 characters
        for (len = 1; len < 6 && s[len]; len++)
        {
        }
        idx = 0;
        msg = utf_decodeChar(s, len, &idx, &u);
        // idx bytes were consumed; leave p on the last one of them
        p += idx - 1;
        if (msg)
        {
            error("%s", msg);
        }
        return u;
    }

    /***************************************************
     * Parse doc comment embedded between t->ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
*/
    final void getDocComment(Token* t, uint lineComment)
    {
        /* ct tells us which kind of comment it is: '/', '*', or '+'
         */
        char ct = t.ptr[2];
        /* Start of comment text skips over / * *, / + +, or / / /
         */
        const(char)* q = t.ptr + 3; // start of comment text
        const(char)* qend = p;
        if (ct == '*' || ct == '+')
            qend -= 2; // exclude the two character comment terminator
        /* Scan over initial row of ****'s or ++++'s or ////'s
         */
        for (; q < qend; q++)
        {
            if (*q != ct)
                break;
        }
        /* Remove leading spaces until start of the comment
         */
        int linestart = 0; // !=0 while only blanks were seen on the current line
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }
        /* Remove trailing row of ****'s or ++++'s
         */
        if (ct != '/')
        {
            for (; q < qend; qend--)
            {
                if (qend[-1] != ct)
                    break;
            }
        }
        /* Comment is now [q .. qend].
         * Canonicalize it into buf[].
         */
        OutBuffer buf;

        // Drop blanks and tabs from the end of what has been written to buf
        void trimTrailingBlanks()
        {
            while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
                buf.offset--;
        }

        for (; q < qend; q++)
        {
            char c = *q;
            switch (c)
            {
            case '*':
            case '+':
                if (linestart && c == ct)
                {
                    // Leading comment character of a continuation line:
                    // drop it together with the whitespace preceding it
                    linestart = 0;
                    trimTrailingBlanks();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                break;
            case '\r':
                if (q[1] == '\n')
                    continue; // skip the \r
                goto Lnewline;
            default:
                if (c == 226)
                {
                    // If LS or PS
                    if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                    {
                        q += 2;
                        goto Lnewline;
                    }
                }
                linestart = 0;
                break;
            Lnewline:
                c = '\n'; // replace all newlines with \n
                goto case;
            case '\n':
                linestart = 1;
                /* Trim trailing whitespace
                 */
                trimTrailingBlanks();
                break;
            }
            buf.writeByte(c);
        }
        /* Trim trailing whitespace (if the last line does not have newline)
         */
        trimTrailingBlanks();
        // Always end with a newline
        if (!buf.offset || buf.data[buf.offset - 1] != '\n')
            buf.writeByte('\n');
        buf.writeByte(0);
        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        const(char)** dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        // Combine with previous doc comment, if any
        if (*dc)
            *dc = combineComments(*dc, cast(char*)buf.data);
        else
            *dc = cast(char*)buf.extractData();
    }

    /**********************************
     * Determine if string is a valid Identifier.
* Placed here because of commonality with Lexer functionality.
     * Params:
     *      p = 0 terminated UTF-8 string, may be null
     * Returns:
     *      false if p is null, empty, starts with a decimal digit, is not
     *      valid UTF-8, or contains a character that is neither '_', an ASCII
     *      letter or digit, nor accepted by isUniAlpha(); true otherwise
     */
    final static bool isValidIdentifier(const(char)* p)
    {
        if (!p || !*p)
            return false;
        if (*p >= '0' && *p <= '9') // beware of isdigit() on signed chars
            return false;
        const size_t len = strlen(p);
        size_t idx = 0;
        while (p[idx])
        {
            dchar_t dc;
            if (utf_decodeChar(cast(char*)p, len, &idx, &dc))
                return false;
            if (dc >= 0x80)
            {
                // The C isalnum() is only defined for unsigned char values,
                // so it must not be consulted for non-ASCII code points.
                if (!isUniAlpha(dc))
                    return false;
            }
            else if (!(isalnum(dc) || dc == '_'))
                return false;
        }
        return true;
    }

    /********************************************
     * Combine two document comments into one,
     * separated by a newline.
     * Returns:
     *      c2 if c1 is null, c1 if c2 is null, otherwise a newly allocated
     *      0 terminated string: c1, a '\n' appended if c1 is non-empty and
     *      does not already end with one, a separating '\n', then c2.
     */
    final static const(char)* combineComments(const(char)* c1, const(char)* c2)
    {
        //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
        if (!c1)
            return c2;
        if (!c2)
            return c1;
        const size_t len1 = strlen(cast(char*)c1);
        const size_t len2 = strlen(cast(char*)c2);
        // c1 needs a '\n' appended when it is non-empty and lacks one
        const bool addNewLine = len1 && c1[len1 - 1] != '\n';
        const size_t head = len1 + (addNewLine ? 1 : 0); // length of c1 part incl. added '\n'
        char* combined = cast(char*)mem.xmalloc(head + 1 + len2 + 1);
        memcpy(combined, c1, len1);
        if (addNewLine)
            combined[len1] = '\n';
        combined[head] = '\n'; // separator between the two comments
        memcpy(combined + head + 1, c2, len2);
        combined[head + 1 + len2] = 0;
        return combined;
    }

private:
    /********************************************
     * Record the start of a new source line: bump scanloc.linnum and
     * remember p as the start of the current line.
     */
    final void endOfLine()
    {
        scanloc.linnum++;
        line = p;
    }
}