1 // Compiler implementation of the D programming language 2 // Copyright (c) 1999-2015 by Digital Mars 3 // All Rights Reserved 4 // written by Walter Bright 5 // http://www.digitalmars.com 6 // Distributed under the Boost Software License, Version 1.0. 7 // http://www.boost.org/LICENSE_1_0.txt 8 9 module ddmd.lexer; 10 11 import core.stdc.ctype; 12 import core.stdc.errno; 13 import core.stdc.stdarg; 14 import core.stdc.stdio; 15 import core.stdc..string; 16 import core.stdc.time; 17 18 import ddmd.entity; 19 import ddmd.errors; 20 import ddmd.globals; 21 import ddmd.id; 22 import ddmd.identifier; 23 import ddmd.root.longdouble; 24 import ddmd.root.outbuffer; 25 import ddmd.root.port; 26 import ddmd.root.rmem; 27 import ddmd.root.stringtable; 28 import ddmd.tokens; 29 import ddmd.utf; 30 31 enum LS = 0x2028; // UTF line separator 32 enum PS = 0x2029; // UTF paragraph separator 33 34 /******************************************** 35 * Do our own char maps 36 */ 37 immutable ubyte[256] cmtable; 38 enum CMoctal = 0x1; 39 enum CMhex = 0x2; 40 enum CMidchar = 0x4; 41 enum CMzerosecond = 0x8; 42 enum CMdigitsecond = 0x10; 43 enum CMsinglechar = 0x20; 44 45 bool isoctal(char c) 46 { 47 return (cmtable[c] & CMoctal) != 0; 48 } 49 50 bool ishex(char c) 51 { 52 return (cmtable[c] & CMhex) != 0; 53 } 54 55 bool isidchar(char c) 56 { 57 return (cmtable[c] & CMidchar) != 0; 58 } 59 60 bool isZeroSecond(char c) 61 { 62 return (cmtable[c] & CMzerosecond) != 0; 63 } 64 65 bool isDigitSecond(char c) 66 { 67 return (cmtable[c] & CMdigitsecond) != 0; 68 } 69 70 bool issinglechar(char c) 71 { 72 return (cmtable[c] & CMsinglechar) != 0; 73 } 74 75 static this() 76 { 77 foreach (const c; 0 .. cmtable.length) 78 { 79 if ('0' <= c && c <= '7') 80 cmtable[c] |= CMoctal; 81 if (isxdigit(c)) 82 cmtable[c] |= CMhex; 83 if (isalnum(c) || c == '_') 84 cmtable[c] |= CMidchar; 85 86 switch (c) 87 { 88 case 'x': case 'X': 89 case 'b': case 'B': 90 cmtable[c] |= CMzerosecond; 91 break; 92 93 case '0': .. case '9': 94 case 'e': case 'E': 95 case 'f': case 'F': 96 case 'l': case 'L': 97 case 'p': case 'P': 98 case 'u': case 'U': 99 case 'i': 100 case '.': 101 case '_': 102 cmtable[c] |= CMzerosecond | CMdigitsecond; 103 break; 104 105 default: 106 break; 107 } 108 109 switch (c) 110 { 111 case '\\': 112 case '\n': 113 case '\r': 114 case 0: 115 case 0x1A: 116 case '\'': 117 break; 118 default: 119 if (!(c & 0x80)) 120 cmtable[c] |= CMsinglechar; 121 break; 122 } 123 } 124 } 125 126 unittest 127 { 128 //printf("lexer.unittest\n"); 129 /* Not much here, just trying things out. 130 */ 131 string text = "int"; 132 scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0); 133 TOK tok; 134 tok = lex1.nextToken(); 135 //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOKint32); 136 assert(tok == TOKint32); 137 tok = lex1.nextToken(); 138 assert(tok == TOKeof); 139 tok = lex1.nextToken(); 140 assert(tok == TOKeof); 141 } 142 143 /*********************************************************** 144 */ 145 class Lexer 146 { 147 public: 148 __gshared OutBuffer stringbuffer; 149 150 Loc scanloc; // for error messages 151 152 const(char)* base; // pointer to start of buffer 153 const(char)* end; // past end of buffer 154 const(char)* p; // current character 155 const(char)* line; // start of current line 156 Token token; 157 bool doDocComment; // collect doc comment information 158 bool anyToken; // seen at least one token 159 bool commentToken; // comments are TOKcomment's 160 bool errors; // errors occurred during lexing or parsing 161 162 /********************* 163 * Creates a Lexer. 164 * Params: 165 * filename = used for error messages 166 * base = source code, ending in a 0 byte 167 * begoffset = starting offset into base[] 168 * endoffset = last offset into base[] 169 * doDocComment = handle documentation comments 170 * commentToken = comments become TOKcomment's 171 */ 172 this(const(char)* filename, const(char)* base, size_t begoffset, size_t endoffset, bool doDocComment, bool commentToken) 173 { 174 scanloc = Loc(filename, 1, 1); 175 //printf("Lexer::Lexer(%p,%d)\n",base,length); 176 //printf("lexer.filename = %s\n", filename); 177 token = Token.init; 178 this.base = base; 179 this.end = base + endoffset; 180 p = base + begoffset; 181 line = p; 182 this.doDocComment = doDocComment; 183 this.commentToken = commentToken; 184 //initKeywords(); 185 /* If first line starts with '#!', ignore the line 186 */ 187 if (p[0] == '#' && p[1] == '!') 188 { 189 p += 2; 190 while (1) 191 { 192 char c = *p; 193 switch (c) 194 { 195 case '\n': 196 p++; 197 break; 198 case '\r': 199 p++; 200 if (*p == '\n') 201 p++; 202 break; 203 case 0: 204 case 0x1A: 205 break; 206 default: 207 if (c & 0x80) 208 { 209 uint u = decodeUTF(); 210 if (u == PS || u == LS) 211 break; 212 } 213 p++; 214 continue; 215 } 216 break; 217 } 218 endOfLine(); 219 } 220 } 221 222 final TOK nextToken() 223 { 224 if (token.next) 225 { 226 Token* t = token.next; 227 memcpy(&token, t, Token.sizeof); 228 t.free(); 229 } 230 else 231 { 232 scan(&token); 233 } 234 //token.print(); 235 return token.value; 236 } 237 238 /*********************** 239 * Look ahead at next token's value. 240 */ 241 final TOK peekNext() 242 { 243 return peek(&token).value; 244 } 245 246 /*********************** 247 * Look 2 tokens ahead at value. 248 */ 249 final TOK peekNext2() 250 { 251 Token* t = peek(&token); 252 return peek(t).value; 253 } 254 255 /**************************** 256 * Turn next token in buffer into a token. 257 */ 258 final void scan(Token* t) 259 { 260 const lastLine = scanloc.linnum; 261 Loc startLoc; 262 t.blockComment = null; 263 t.lineComment = null; 264 while (1) 265 { 266 t.ptr = p; 267 //printf("p = %p, *p = '%c'\n",p,*p); 268 t.loc = loc(); 269 switch (*p) 270 { 271 case 0: 272 case 0x1A: 273 t.value = TOKeof; // end of file 274 return; 275 case ' ': 276 case '\t': 277 case '\v': 278 case '\f': 279 p++; 280 continue; 281 // skip white space 282 case '\r': 283 p++; 284 if (*p != '\n') // if CR stands by itself 285 endOfLine(); 286 continue; 287 // skip white space 288 case '\n': 289 p++; 290 endOfLine(); 291 continue; 292 // skip white space 293 case '0': 294 if (!isZeroSecond(p[1])) // if numeric literal does not continue 295 { 296 ++p; 297 t.uns64value = 0; 298 t.value = TOKint32v; 299 return; 300 } 301 goto Lnumber; 302 303 case '1': .. case '9': 304 if (!isDigitSecond(p[1])) // if numeric literal does not continue 305 { 306 t.uns64value = *p - '0'; 307 ++p; 308 t.value = TOKint32v; 309 return; 310 } 311 Lnumber: 312 t.value = number(t); 313 return; 314 315 case '\'': 316 if (issinglechar(p[1]) && p[2] == '\'') 317 { 318 t.uns64value = p[1]; // simple one character literal 319 t.value = TOKcharv; 320 p += 3; 321 } 322 else 323 t.value = charConstant(t); 324 return; 325 case 'r': 326 if (p[1] != '"') 327 goto case_ident; 328 p++; 329 goto case '`'; 330 case '`': 331 t.value = wysiwygStringConstant(t, *p); 332 return; 333 case 'x': 334 if (p[1] != '"') 335 goto case_ident; 336 p++; 337 t.value = hexStringConstant(t); 338 return; 339 case 'q': 340 if (p[1] == '"') 341 { 342 p++; 343 t.value = delimitedStringConstant(t); 344 return; 345 } 346 else if (p[1] == '{') 347 { 348 p++; 349 t.value = tokenStringConstant(t); 350 return; 351 } 352 else 353 goto case_ident; 354 case '"': 355 t.value = escapeStringConstant(t, 0); 356 return; 357 case 'a': 358 case 'b': 359 case 'c': 360 case 'd': 361 case 'e': 362 case 'f': 363 case 'g': 364 case 'h': 365 case 'i': 366 case 'j': 367 case 'k': 368 case 'l': 369 case 'm': 370 case 'n': 371 case 'o': 372 case 'p': 373 /*case 'q': case 'r':*/ 374 case 's': 375 case 't': 376 case 'u': 377 case 'v': 378 case 'w': 379 /*case 'x':*/ 380 case 'y': 381 case 'z': 382 case 'A': 383 case 'B': 384 case 'C': 385 case 'D': 386 case 'E': 387 case 'F': 388 case 'G': 389 case 'H': 390 case 'I': 391 case 'J': 392 case 'K': 393 case 'L': 394 case 'M': 395 case 'N': 396 case 'O': 397 case 'P': 398 case 'Q': 399 case 'R': 400 case 'S': 401 case 'T': 402 case 'U': 403 case 'V': 404 case 'W': 405 case 'X': 406 case 'Y': 407 case 'Z': 408 case '_': 409 case_ident: 410 { 411 while (1) 412 { 413 const c = *++p; 414 if (isidchar(c)) 415 continue; 416 else if (c & 0x80) 417 { 418 const s = p; 419 const u = decodeUTF(); 420 if (isUniAlpha(u)) 421 continue; 422 error("char 0x%04x not allowed in identifier", u); 423 p = s; 424 } 425 break; 426 } 427 Identifier id = Identifier.idPool(cast(char*)t.ptr, p - t.ptr); 428 t.ident = id; 429 t.value = cast(TOK)id.value; 430 anyToken = 1; 431 if (*t.ptr == '_') // if special identifier token 432 { 433 __gshared bool initdone = false; 434 __gshared char[11 + 1] date; 435 __gshared char[8 + 1] time; 436 __gshared char[24 + 1] timestamp; 437 if (!initdone) // lazy evaluation 438 { 439 initdone = true; 440 time_t ct; 441 .time(&ct); 442 const p = ctime(&ct); 443 assert(p); 444 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20); 445 sprintf(&time[0], "%.8s", p + 11); 446 sprintf(×tamp[0], "%.24s", p); 447 } 448 if (id == Id.DATE) 449 { 450 t.ustring = date.ptr; 451 goto Lstr; 452 } 453 else if (id == Id.TIME) 454 { 455 t.ustring = time.ptr; 456 goto Lstr; 457 } 458 else if (id == Id.VENDOR) 459 { 460 t.ustring = global.compiler.vendor; 461 goto Lstr; 462 } 463 else if (id == Id.TIMESTAMP) 464 { 465 t.ustring = timestamp.ptr; 466 Lstr: 467 t.value = TOKstring; 468 t.postfix = 0; 469 t.len = cast(uint)strlen(t.ustring); 470 } 471 else if (id == Id.VERSIONX) 472 { 473 uint major = 0; 474 uint minor = 0; 475 bool point = false; 476 for (const(char)* p = global._version + 1; 1; p++) 477 { 478 const c = *p; 479 if (isdigit(cast(char)c)) 480 minor = minor * 10 + c - '0'; 481 else if (c == '.') 482 { 483 if (point) 484 break; 485 // ignore everything after second '.' 486 point = true; 487 major = minor; 488 minor = 0; 489 } 490 else 491 break; 492 } 493 t.value = TOKint64v; 494 t.uns64value = major * 1000 + minor; 495 } 496 else if (id == Id.EOFX) 497 { 498 t.value = TOKeof; 499 // Advance scanner to end of file 500 while (!(*p == 0 || *p == 0x1A)) 501 p++; 502 } 503 } 504 //printf("t->value = %d\n",t->value); 505 return; 506 } 507 case '/': 508 p++; 509 switch (*p) 510 { 511 case '=': 512 p++; 513 t.value = TOKdivass; 514 return; 515 case '*': 516 p++; 517 startLoc = loc(); 518 while (1) 519 { 520 while (1) 521 { 522 const c = *p; 523 switch (c) 524 { 525 case '/': 526 break; 527 case '\n': 528 endOfLine(); 529 p++; 530 continue; 531 case '\r': 532 p++; 533 if (*p != '\n') 534 endOfLine(); 535 continue; 536 case 0: 537 case 0x1A: 538 error("unterminated /* */ comment"); 539 p = end; 540 t.loc = loc(); 541 t.value = TOKeof; 542 return; 543 default: 544 if (c & 0x80) 545 { 546 const u = decodeUTF(); 547 if (u == PS || u == LS) 548 endOfLine(); 549 } 550 p++; 551 continue; 552 } 553 break; 554 } 555 p++; 556 if (p[-2] == '*' && p - 3 != t.ptr) 557 break; 558 } 559 if (commentToken) 560 { 561 t.loc = startLoc; 562 t.value = TOKcomment; 563 return; 564 } 565 else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr) 566 { 567 // if /** but not /**/ 568 getDocComment(t, lastLine == startLoc.linnum); 569 } 570 continue; 571 case '/': 572 // do // style comments 573 startLoc = loc(); 574 while (1) 575 { 576 const c = *++p; 577 switch (c) 578 { 579 case '\n': 580 break; 581 case '\r': 582 if (p[1] == '\n') 583 p++; 584 break; 585 case 0: 586 case 0x1A: 587 if (commentToken) 588 { 589 p = end; 590 t.loc = startLoc; 591 t.value = TOKcomment; 592 return; 593 } 594 if (doDocComment && t.ptr[2] == '/') 595 getDocComment(t, lastLine == startLoc.linnum); 596 p = end; 597 t.loc = loc(); 598 t.value = TOKeof; 599 return; 600 default: 601 if (c & 0x80) 602 { 603 const u = decodeUTF(); 604 if (u == PS || u == LS) 605 break; 606 } 607 continue; 608 } 609 break; 610 } 611 if (commentToken) 612 { 613 p++; 614 endOfLine(); 615 t.loc = startLoc; 616 t.value = TOKcomment; 617 return; 618 } 619 if (doDocComment && t.ptr[2] == '/') 620 getDocComment(t, lastLine == startLoc.linnum); 621 p++; 622 endOfLine(); 623 continue; 624 case '+': 625 { 626 int nest; 627 startLoc = loc(); 628 p++; 629 nest = 1; 630 while (1) 631 { 632 char c = *p; 633 switch (c) 634 { 635 case '/': 636 p++; 637 if (*p == '+') 638 { 639 p++; 640 nest++; 641 } 642 continue; 643 case '+': 644 p++; 645 if (*p == '/') 646 { 647 p++; 648 if (--nest == 0) 649 break; 650 } 651 continue; 652 case '\r': 653 p++; 654 if (*p != '\n') 655 endOfLine(); 656 continue; 657 case '\n': 658 endOfLine(); 659 p++; 660 continue; 661 case 0: 662 case 0x1A: 663 error("unterminated /+ +/ comment"); 664 p = end; 665 t.loc = loc(); 666 t.value = TOKeof; 667 return; 668 default: 669 if (c & 0x80) 670 { 671 uint u = decodeUTF(); 672 if (u == PS || u == LS) 673 endOfLine(); 674 } 675 p++; 676 continue; 677 } 678 break; 679 } 680 if (commentToken) 681 { 682 t.loc = startLoc; 683 t.value = TOKcomment; 684 return; 685 } 686 if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr) 687 { 688 // if /++ but not /++/ 689 getDocComment(t, lastLine == startLoc.linnum); 690 } 691 continue; 692 } 693 default: 694 break; 695 } 696 t.value = TOKdiv; 697 return; 698 case '.': 699 p++; 700 if (isdigit(*p)) 701 { 702 /* Note that we don't allow ._1 and ._ as being 703 * valid floating point numbers. 704 */ 705 p--; 706 t.value = inreal(t); 707 } 708 else if (p[0] == '.') 709 { 710 if (p[1] == '.') 711 { 712 p += 2; 713 t.value = TOKdotdotdot; 714 } 715 else 716 { 717 p++; 718 t.value = TOKslice; 719 } 720 } 721 else 722 t.value = TOKdot; 723 return; 724 case '&': 725 p++; 726 if (*p == '=') 727 { 728 p++; 729 t.value = TOKandass; 730 } 731 else if (*p == '&') 732 { 733 p++; 734 t.value = TOKandand; 735 } 736 else 737 t.value = TOKand; 738 return; 739 case '|': 740 p++; 741 if (*p == '=') 742 { 743 p++; 744 t.value = TOKorass; 745 } 746 else if (*p == '|') 747 { 748 p++; 749 t.value = TOKoror; 750 } 751 else 752 t.value = TOKor; 753 return; 754 case '-': 755 p++; 756 if (*p == '=') 757 { 758 p++; 759 t.value = TOKminass; 760 } 761 else if (*p == '-') 762 { 763 p++; 764 t.value = TOKminusminus; 765 } 766 else 767 t.value = TOKmin; 768 return; 769 case '+': 770 p++; 771 if (*p == '=') 772 { 773 p++; 774 t.value = TOKaddass; 775 } 776 else if (*p == '+') 777 { 778 p++; 779 t.value = TOKplusplus; 780 } 781 else 782 t.value = TOKadd; 783 return; 784 case '<': 785 p++; 786 if (*p == '=') 787 { 788 p++; 789 t.value = TOKle; // <= 790 } 791 else if (*p == '<') 792 { 793 p++; 794 if (*p == '=') 795 { 796 p++; 797 t.value = TOKshlass; // <<= 798 } 799 else 800 t.value = TOKshl; // << 801 } 802 else if (*p == '>') 803 { 804 p++; 805 if (*p == '=') 806 { 807 p++; 808 t.value = TOKleg; // <>= 809 } 810 else 811 t.value = TOKlg; // <> 812 } 813 else 814 t.value = TOKlt; // < 815 return; 816 case '>': 817 p++; 818 if (*p == '=') 819 { 820 p++; 821 t.value = TOKge; // >= 822 } 823 else if (*p == '>') 824 { 825 p++; 826 if (*p == '=') 827 { 828 p++; 829 t.value = TOKshrass; // >>= 830 } 831 else if (*p == '>') 832 { 833 p++; 834 if (*p == '=') 835 { 836 p++; 837 t.value = TOKushrass; // >>>= 838 } 839 else 840 t.value = TOKushr; // >>> 841 } 842 else 843 t.value = TOKshr; // >> 844 } 845 else 846 t.value = TOKgt; // > 847 return; 848 case '!': 849 p++; 850 if (*p == '=') 851 { 852 p++; 853 t.value = TOKnotequal; // != 854 } 855 else if (*p == '<') 856 { 857 p++; 858 if (*p == '>') 859 { 860 p++; 861 if (*p == '=') 862 { 863 p++; 864 t.value = TOKunord; // !<>= 865 } 866 else 867 t.value = TOKue; // !<> 868 } 869 else if (*p == '=') 870 { 871 p++; 872 t.value = TOKug; // !<= 873 } 874 else 875 t.value = TOKuge; // !< 876 } 877 else if (*p == '>') 878 { 879 p++; 880 if (*p == '=') 881 { 882 p++; 883 t.value = TOKul; // !>= 884 } 885 else 886 t.value = TOKule; // !> 887 } 888 else 889 t.value = TOKnot; // ! 890 return; 891 case '=': 892 p++; 893 if (*p == '=') 894 { 895 p++; 896 t.value = TOKequal; // == 897 } 898 else if (*p == '>') 899 { 900 p++; 901 t.value = TOKgoesto; // => 902 } 903 else 904 t.value = TOKassign; // = 905 return; 906 case '~': 907 p++; 908 if (*p == '=') 909 { 910 p++; 911 t.value = TOKcatass; // ~= 912 } 913 else 914 t.value = TOKtilde; // ~ 915 return; 916 case '^': 917 p++; 918 if (*p == '^') 919 { 920 p++; 921 if (*p == '=') 922 { 923 p++; 924 t.value = TOKpowass; // ^^= 925 } 926 else 927 t.value = TOKpow; // ^^ 928 } 929 else if (*p == '=') 930 { 931 p++; 932 t.value = TOKxorass; // ^= 933 } 934 else 935 t.value = TOKxor; // ^ 936 return; 937 case '(': 938 p++; 939 t.value = TOKlparen; 940 return; 941 case ')': 942 p++; 943 t.value = TOKrparen; 944 return; 945 case '[': 946 p++; 947 t.value = TOKlbracket; 948 return; 949 case ']': 950 p++; 951 t.value = TOKrbracket; 952 return; 953 case '{': 954 p++; 955 t.value = TOKlcurly; 956 return; 957 case '}': 958 p++; 959 t.value = TOKrcurly; 960 return; 961 case '?': 962 p++; 963 t.value = TOKquestion; 964 return; 965 case ',': 966 p++; 967 t.value = TOKcomma; 968 return; 969 case ';': 970 p++; 971 t.value = TOKsemicolon; 972 return; 973 case ':': 974 p++; 975 t.value = TOKcolon; 976 return; 977 case '$': 978 p++; 979 t.value = TOKdollar; 980 return; 981 case '@': 982 p++; 983 t.value = TOKat; 984 return; 985 case '*': 986 p++; 987 if (*p == '=') 988 { 989 p++; 990 t.value = TOKmulass; 991 } 992 else 993 t.value = TOKmul; 994 return; 995 case '%': 996 p++; 997 if (*p == '=') 998 { 999 p++; 1000 t.value = TOKmodass; 1001 } 1002 else 1003 t.value = TOKmod; 1004 return; 1005 case '#': 1006 { 1007 p++; 1008 Token n; 1009 scan(&n); 1010 if (n.value == TOKidentifier && n.ident == Id.line) 1011 { 1012 poundLine(); 1013 continue; 1014 } 1015 else 1016 { 1017 t.value = TOKpound; 1018 return; 1019 } 1020 } 1021 default: 1022 { 1023 dchar c = *p; 1024 if (c & 0x80) 1025 { 1026 c = decodeUTF(); 1027 // Check for start of unicode identifier 1028 if (isUniAlpha(c)) 1029 goto case_ident; 1030 if (c == PS || c == LS) 1031 { 1032 endOfLine(); 1033 p++; 1034 continue; 1035 } 1036 } 1037 if (c < 0x80 && isprint(c)) 1038 error("character '%c' is not a valid token", c); 1039 else 1040 error("character 0x%02x is not a valid token", c); 1041 p++; 1042 continue; 1043 } 1044 } 1045 } 1046 } 1047 1048 final Token* peek(Token* ct) 1049 { 1050 Token* t; 1051 if (ct.next) 1052 t = ct.next; 1053 else 1054 { 1055 t = Token.alloc(); 1056 scan(t); 1057 ct.next = t; 1058 } 1059 return t; 1060 } 1061 1062 /********************************* 1063 * tk is on the opening (. 1064 * Look ahead and return token that is past the closing ). 1065 */ 1066 final Token* peekPastParen(Token* tk) 1067 { 1068 //printf("peekPastParen()\n"); 1069 int parens = 1; 1070 int curlynest = 0; 1071 while (1) 1072 { 1073 tk = peek(tk); 1074 //tk->print(); 1075 switch (tk.value) 1076 { 1077 case TOKlparen: 1078 parens++; 1079 continue; 1080 case TOKrparen: 1081 --parens; 1082 if (parens) 1083 continue; 1084 tk = peek(tk); 1085 break; 1086 case TOKlcurly: 1087 curlynest++; 1088 continue; 1089 case TOKrcurly: 1090 if (--curlynest >= 0) 1091 continue; 1092 break; 1093 case TOKsemicolon: 1094 if (curlynest) 1095 continue; 1096 break; 1097 case TOKeof: 1098 break; 1099 default: 1100 continue; 1101 } 1102 return tk; 1103 } 1104 } 1105 1106 /******************************************* 1107 * Parse escape sequence. 1108 */ 1109 final uint escapeSequence() 1110 { 1111 uint c = *p; 1112 int ndigits; 1113 switch (c) 1114 { 1115 case '\'': 1116 case '"': 1117 case '?': 1118 case '\\': 1119 Lconsume: 1120 p++; 1121 break; 1122 case 'a': 1123 c = 7; 1124 goto Lconsume; 1125 case 'b': 1126 c = 8; 1127 goto Lconsume; 1128 case 'f': 1129 c = 12; 1130 goto Lconsume; 1131 case 'n': 1132 c = 10; 1133 goto Lconsume; 1134 case 'r': 1135 c = 13; 1136 goto Lconsume; 1137 case 't': 1138 c = 9; 1139 goto Lconsume; 1140 case 'v': 1141 c = 11; 1142 goto Lconsume; 1143 case 'u': 1144 ndigits = 4; 1145 goto Lhex; 1146 case 'U': 1147 ndigits = 8; 1148 goto Lhex; 1149 case 'x': 1150 ndigits = 2; 1151 Lhex: 1152 p++; 1153 c = *p; 1154 if (ishex(cast(char)c)) 1155 { 1156 uint v = 0; 1157 int n = 0; 1158 while (1) 1159 { 1160 if (isdigit(cast(char)c)) 1161 c -= '0'; 1162 else if (islower(c)) 1163 c -= 'a' - 10; 1164 else 1165 c -= 'A' - 10; 1166 v = v * 16 + c; 1167 c = *++p; 1168 if (++n == ndigits) 1169 break; 1170 if (!ishex(cast(char)c)) 1171 { 1172 error("escape hex sequence has %d hex digits instead of %d", n, ndigits); 1173 break; 1174 } 1175 } 1176 if (ndigits != 2 && !utf_isValidDchar(v)) 1177 { 1178 error("invalid UTF character \\U%08x", v); 1179 v = '?'; // recover with valid UTF character 1180 } 1181 c = v; 1182 } 1183 else 1184 error("undefined escape hex sequence \\%c", c); 1185 break; 1186 case '&': 1187 // named character entity 1188 for (const idstart = ++p; 1; p++) 1189 { 1190 switch (*p) 1191 { 1192 case ';': 1193 c = HtmlNamedEntity(idstart, p - idstart); 1194 if (c == ~0) 1195 { 1196 error("unnamed character entity &%.*s;", cast(int)(p - idstart), idstart); 1197 c = ' '; 1198 } 1199 p++; 1200 break; 1201 default: 1202 if (isalpha(*p) || (p != idstart && isdigit(*p))) 1203 continue; 1204 error("unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart); 1205 break; 1206 } 1207 break; 1208 } 1209 break; 1210 case 0: 1211 case 0x1A: 1212 // end of file 1213 c = '\\'; 1214 break; 1215 default: 1216 if (isoctal(cast(char)c)) 1217 { 1218 uint v = 0; 1219 int n = 0; 1220 do 1221 { 1222 v = v * 8 + (c - '0'); 1223 c = *++p; 1224 } 1225 while (++n < 3 && isoctal(cast(char)c)); 1226 c = v; 1227 if (c > 0xFF) 1228 error("escape octal sequence \\%03o is larger than \\377", c); 1229 } 1230 else 1231 error("undefined escape sequence \\%c", c); 1232 break; 1233 } 1234 return c; 1235 } 1236 1237 /************************************** 1238 */ 1239 final TOK wysiwygStringConstant(Token* t, int tc) 1240 { 1241 Loc start = loc(); 1242 p++; 1243 stringbuffer.reset(); 1244 while (1) 1245 { 1246 dchar c = *p++; 1247 switch (c) 1248 { 1249 case '\n': 1250 endOfLine(); 1251 break; 1252 case '\r': 1253 if (*p == '\n') 1254 continue; 1255 // ignore 1256 c = '\n'; // treat EndOfLine as \n character 1257 endOfLine(); 1258 break; 1259 case 0: 1260 case 0x1A: 1261 error("unterminated string constant starting at %s", start.toChars()); 1262 t.setString(); 1263 return TOKstring; 1264 case '"': 1265 case '`': 1266 if (c == tc) 1267 { 1268 t.setString(stringbuffer); 1269 stringPostfix(t); 1270 return TOKstring; 1271 } 1272 break; 1273 default: 1274 if (c & 0x80) 1275 { 1276 p--; 1277 const u = decodeUTF(); 1278 p++; 1279 if (u == PS || u == LS) 1280 endOfLine(); 1281 stringbuffer.writeUTF8(u); 1282 continue; 1283 } 1284 break; 1285 } 1286 stringbuffer.writeByte(c); 1287 } 1288 } 1289 1290 /************************************** 1291 * Lex hex strings: 1292 * x"0A ae 34FE BD" 1293 */ 1294 final TOK hexStringConstant(Token* t) 1295 { 1296 Loc start = loc(); 1297 uint n = 0; 1298 uint v = ~0; // dead assignment, needed to suppress warning 1299 p++; 1300 stringbuffer.reset(); 1301 while (1) 1302 { 1303 dchar c = *p++; 1304 switch (c) 1305 { 1306 case ' ': 1307 case '\t': 1308 case '\v': 1309 case '\f': 1310 continue; // skip white space 1311 case '\r': 1312 if (*p == '\n') 1313 continue; // ignore '\r' if followed by '\n' 1314 // Treat isolated '\r' as if it were a '\n' 1315 goto case '\n'; 1316 case '\n': 1317 endOfLine(); 1318 continue; 1319 case 0: 1320 case 0x1A: 1321 error("unterminated string constant starting at %s", start.toChars()); 1322 t.setString(); 1323 return TOKxstring; 1324 case '"': 1325 if (n & 1) 1326 { 1327 error("odd number (%d) of hex characters in hex string", n); 1328 stringbuffer.writeByte(v); 1329 } 1330 t.setString(stringbuffer); 1331 stringPostfix(t); 1332 return TOKxstring; 1333 default: 1334 if (c >= '0' && c <= '9') 1335 c -= '0'; 1336 else if (c >= 'a' && c <= 'f') 1337 c -= 'a' - 10; 1338 else if (c >= 'A' && c <= 'F') 1339 c -= 'A' - 10; 1340 else if (c & 0x80) 1341 { 1342 p--; 1343 const u = decodeUTF(); 1344 p++; 1345 if (u == PS || u == LS) 1346 endOfLine(); 1347 else 1348 error("non-hex character \\u%04x in hex string", u); 1349 } 1350 else 1351 error("non-hex character '%c' in hex string", c); 1352 if (n & 1) 1353 { 1354 v = (v << 4) | c; 1355 stringbuffer.writeByte(v); 1356 } 1357 else 1358 v = c; 1359 n++; 1360 break; 1361 } 1362 } 1363 assert(0); // see bug 15731 1364 } 1365 1366 /************************************** 1367 * Lex delimited strings: 1368 * q"(foo(xxx))" // "foo(xxx)" 1369 * q"[foo(]" // "foo(" 1370 * q"/foo]/" // "foo]" 1371 * q"HERE 1372 * foo 1373 * HERE" // "foo\n" 1374 * Input: 1375 * p is on the " 1376 */ 1377 final TOK delimitedStringConstant(Token* t) 1378 { 1379 Loc start = loc(); 1380 dchar delimleft = 0; 1381 dchar delimright = 0; 1382 uint nest = 1; 1383 uint nestcount = ~0; // dead assignment, needed to suppress warning 1384 Identifier hereid = null; 1385 uint blankrol = 0; 1386 uint startline = 0; 1387 p++; 1388 stringbuffer.reset(); 1389 while (1) 1390 { 1391 dchar c = *p++; 1392 //printf("c = '%c'\n", c); 1393 switch (c) 1394 { 1395 case '\n': 1396 Lnextline: 1397 endOfLine(); 1398 startline = 1; 1399 if (blankrol) 1400 { 1401 blankrol = 0; 1402 continue; 1403 } 1404 if (hereid) 1405 { 1406 stringbuffer.writeUTF8(c); 1407 continue; 1408 } 1409 break; 1410 case '\r': 1411 if (*p == '\n') 1412 continue; 1413 // ignore 1414 c = '\n'; // treat EndOfLine as \n character 1415 goto Lnextline; 1416 case 0: 1417 case 0x1A: 1418 error("unterminated delimited string constant starting at %s", start.toChars()); 1419 t.setString(); 1420 return TOKstring; 1421 default: 1422 if (c & 0x80) 1423 { 1424 p--; 1425 c = decodeUTF(); 1426 p++; 1427 if (c == PS || c == LS) 1428 goto Lnextline; 1429 } 1430 break; 1431 } 1432 if (delimleft == 0) 1433 { 1434 delimleft = c; 1435 nest = 1; 1436 nestcount = 1; 1437 if (c == '(') 1438 delimright = ')'; 1439 else if (c == '{') 1440 delimright = '}'; 1441 else if (c == '[') 1442 delimright = ']'; 1443 else if (c == '<') 1444 delimright = '>'; 1445 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) 1446 { 1447 // Start of identifier; must be a heredoc 1448 Token tok; 1449 p--; 1450 scan(&tok); // read in heredoc identifier 1451 if (tok.value != TOKidentifier) 1452 { 1453 error("identifier expected for heredoc, not %s", tok.toChars()); 1454 delimright = c; 1455 } 1456 else 1457 { 1458 hereid = tok.ident; 1459 //printf("hereid = '%s'\n", hereid->toChars()); 1460 blankrol = 1; 1461 } 1462 nest = 0; 1463 } 1464 else 1465 { 1466 delimright = c; 1467 nest = 0; 1468 if (isspace(c)) 1469 error("delimiter cannot be whitespace"); 1470 } 1471 } 1472 else 1473 { 1474 if (blankrol) 1475 { 1476 error("heredoc rest of line should be blank"); 1477 blankrol = 0; 1478 continue; 1479 } 1480 if (nest == 1) 1481 { 1482 if (c == delimleft) 1483 nestcount++; 1484 else if (c == delimright) 1485 { 1486 nestcount--; 1487 if (nestcount == 0) 1488 goto Ldone; 1489 } 1490 } 1491 else if (c == delimright) 1492 goto Ldone; 1493 if (startline && isalpha(c) && hereid) 1494 { 1495 Token tok; 1496 auto psave = p; 1497 p--; 1498 scan(&tok); // read in possible heredoc identifier 1499 //printf("endid = '%s'\n", tok.ident->toChars()); 1500 if (tok.value == TOKidentifier && tok.ident.equals(hereid)) 1501 { 1502 /* should check that rest of line is blank 1503 */ 1504 goto Ldone; 1505 } 1506 p = psave; 1507 } 1508 stringbuffer.writeUTF8(c); 1509 startline = 0; 1510 } 1511 } 1512 Ldone: 1513 if (*p == '"') 1514 p++; 1515 else if (hereid) 1516 error("delimited string must end in %s\"", hereid.toChars()); 1517 else 1518 error("delimited string must end in %c\"", delimright); 1519 t.setString(stringbuffer); 1520 stringPostfix(t); 1521 return TOKstring; 1522 } 1523 1524 /************************************** 1525 * Lex delimited strings: 1526 * q{ foo(xxx) } // " foo(xxx) " 1527 * q{foo(} // "foo(" 1528 * q{{foo}"}"} // "{foo}"}"" 1529 * Input: 1530 * p is on the q 1531 */ 1532 final TOK tokenStringConstant(Token* t) 1533 { 1534 uint nest = 1; 1535 const start = loc(); 1536 const pstart = ++p; 1537 while (1) 1538 { 1539 Token tok; 1540 scan(&tok); 1541 switch (tok.value) 1542 { 1543 case TOKlcurly: 1544 nest++; 1545 continue; 1546 case TOKrcurly: 1547 if (--nest == 0) 1548 { 1549 t.setString(pstart, p - 1 - pstart); 1550 stringPostfix(t); 1551 return TOKstring; 1552 } 1553 continue; 1554 case TOKeof: 1555 error("unterminated token string constant starting at %s", start.toChars()); 1556 t.setString(); 1557 return TOKstring; 1558 default: 1559 continue; 1560 } 1561 } 1562 } 1563 1564 /************************************** 1565 */ 1566 final TOK escapeStringConstant(Token* t, int wide) 1567 { 1568 const start = loc(); 1569 p++; 1570 stringbuffer.reset(); 1571 while (1) 1572 { 1573 dchar c = *p++; 1574 switch (c) 1575 { 1576 case '\\': 1577 switch (*p) 1578 { 1579 case 'u': 1580 case 'U': 1581 case '&': 1582 c = escapeSequence(); 1583 stringbuffer.writeUTF8(c); 1584 continue; 1585 default: 1586 c = escapeSequence(); 1587 break; 1588 } 1589 break; 1590 case '\n': 1591 endOfLine(); 1592 break; 1593 case '\r': 1594 if (*p == '\n') 1595 continue; 1596 // ignore 1597 c = '\n'; // treat EndOfLine as \n character 1598 endOfLine(); 1599 break; 1600 case '"': 1601 t.setString(stringbuffer); 1602 stringPostfix(t); 1603 return TOKstring; 1604 case 0: 1605 case 0x1A: 1606 p--; 1607 error("unterminated string constant starting at %s", start.toChars()); 1608 t.setString(); 1609 return TOKstring; 1610 default: 1611 if (c & 0x80) 1612 { 1613 p--; 1614 c = decodeUTF(); 1615 if (c == LS || c == PS) 1616 { 1617 c = '\n'; 1618 endOfLine(); 1619 } 1620 p++; 1621 stringbuffer.writeUTF8(c); 1622 continue; 1623 } 1624 break; 1625 } 1626 stringbuffer.writeByte(c); 1627 } 1628 } 1629 1630 /************************************** 1631 */ 1632 final TOK charConstant(Token* t) 1633 { 1634 auto tk = TOKcharv; 1635 //printf("Lexer::charConstant\n"); 1636 p++; 1637 dchar c = *p++; 1638 switch (c) 1639 { 1640 case '\\': 1641 switch (*p) 1642 { 1643 case 'u': 1644 t.uns64value = escapeSequence(); 1645 tk = TOKwcharv; 1646 break; 1647 case 'U': 1648 case '&': 1649 t.uns64value = escapeSequence(); 1650 tk = TOKdcharv; 1651 break; 1652 default: 1653 t.uns64value = escapeSequence(); 1654 break; 1655 } 1656 break; 1657 case '\n': 1658 L1: 1659 endOfLine(); 1660 goto case; 1661 case '\r': 1662 case 0: 1663 case 0x1A: 1664 case '\'': 1665 error("unterminated character constant"); 1666 t.uns64value = '?'; 1667 return tk; 1668 default: 1669 if (c & 0x80) 1670 { 1671 p--; 1672 c = decodeUTF(); 1673 p++; 1674 if (c == LS || c == PS) 1675 goto L1; 1676 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) 1677 tk = TOKwcharv; 1678 else 1679 tk = TOKdcharv; 1680 } 1681 t.uns64value = c; 1682 break; 1683 } 1684 if (*p != '\'') 1685 { 1686 error("unterminated character constant"); 1687 t.uns64value = '?'; 1688 return tk; 1689 } 1690 p++; 1691 return tk; 1692 } 1693 1694 /*************************************** 1695 * Get postfix of string literal. 1696 */ 1697 final void stringPostfix(Token* t) 1698 { 1699 switch (*p) 1700 { 1701 case 'c': 1702 case 'w': 1703 case 'd': 1704 t.postfix = *p; 1705 p++; 1706 break; 1707 default: 1708 t.postfix = 0; 1709 break; 1710 } 1711 } 1712 1713 /************************************** 1714 * Read in a number. 1715 * If it's an integer, store it in tok.TKutok.Vlong. 1716 * integers can be decimal, octal or hex 1717 * Handle the suffixes U, UL, LU, L, etc. 1718 * If it's double, store it in tok.TKutok.Vdouble. 1719 * Returns: 1720 * TKnum 1721 * TKdouble,... 1722 */ 1723 final TOK number(Token* t) 1724 { 1725 int base = 10; 1726 const start = p; 1727 uinteger_t n = 0; // unsigned >=64 bit integer type 1728 int d; 1729 bool err = false; 1730 bool overflow = false; 1731 dchar c = *p; 1732 if (c == '0') 1733 { 1734 ++p; 1735 c = *p; 1736 switch (c) 1737 { 1738 case '0': 1739 case '1': 1740 case '2': 1741 case '3': 1742 case '4': 1743 case '5': 1744 case '6': 1745 case '7': 1746 n = c - '0'; 1747 ++p; 1748 base = 8; 1749 break; 1750 case 'x': 1751 case 'X': 1752 ++p; 1753 base = 16; 1754 break; 1755 case 'b': 1756 case 'B': 1757 ++p; 1758 base = 2; 1759 break; 1760 case '.': 1761 if (p[1] == '.') 1762 goto Ldone; 1763 // if ".." 1764 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) 1765 goto Ldone; 1766 // if ".identifier" or ".unicode" 1767 goto Lreal; 1768 // '.' is part of current token 1769 case 'i': 1770 case 'f': 1771 case 'F': 1772 goto Lreal; 1773 case '_': 1774 ++p; 1775 base = 8; 1776 break; 1777 case 'L': 1778 if (p[1] == 'i') 1779 goto Lreal; 1780 break; 1781 default: 1782 break; 1783 } 1784 } 1785 while (1) 1786 { 1787 c = *p; 1788 switch (c) 1789 { 1790 case '0': 1791 case '1': 1792 ++p; 1793 d = c - '0'; 1794 break; 1795 case '2': 1796 case '3': 1797 case '4': 1798 case '5': 1799 case '6': 1800 case '7': 1801 if (base == 2 && !err) 1802 { 1803 error("binary digit expected"); 1804 err = true; 1805 } 1806 ++p; 1807 d = c - '0'; 1808 break; 1809 case '8': 1810 case '9': 1811 ++p; 1812 if (base < 10 && !err) 1813 { 1814 error("radix %d digit expected, not '%c'", base, c); 1815 err = true; 1816 } 1817 d = c - '0'; 1818 break; 1819 case 'a': 1820 case 'b': 1821 case 'c': 1822 case 'd': 1823 case 'e': 1824 case 'f': 1825 case 'A': 1826 case 'B': 1827 case 'C': 1828 case 'D': 1829 case 'E': 1830 case 'F': 1831 ++p; 1832 if (base != 16) 1833 { 1834 if (c == 'e' || c == 'E' || c == 'f' || c == 'F') 1835 goto Lreal; 1836 if (!err) 1837 { 1838 error("radix %d digit expected, not '%c'", base, c); 1839 err = true; 1840 } 1841 } 1842 if (c >= 'a') 1843 d = c + 10 - 'a'; 1844 else 1845 d = c + 10 - 'A'; 1846 break; 1847 case 'L': 1848 if (p[1] == 'i') 1849 goto Lreal; 1850 goto Ldone; 1851 case '.': 1852 if (p[1] == '.') 1853 goto Ldone; 1854 // if ".." 1855 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) 1856 goto Ldone; 1857 // if ".identifier" or ".unicode" 1858 goto Lreal; 1859 // otherwise as part of a floating point literal 1860 case 'p': 1861 case 'P': 1862 case 'i': 1863 Lreal: 1864 p = start; 1865 return inreal(t); 1866 case '_': 1867 ++p; 1868 continue; 1869 default: 1870 goto Ldone; 1871 } 1872 // Avoid expensive overflow check if we aren't at risk of overflow 1873 if (n <= 0x0FFF_FFFF_FFFF_FFFFUL) 1874 n = n * base + d; 1875 else 1876 { 1877 import core.checkedint : mulu, addu; 1878 1879 n = mulu(n, base, overflow); 1880 n = addu(n, d, overflow); 1881 } 1882 } 1883 Ldone: 1884 if (overflow && !err) 1885 { 1886 error("integer overflow"); 1887 err = true; 1888 } 1889 enum FLAGS : int 1890 { 1891 FLAGS_none = 0, 1892 FLAGS_decimal = 1, // decimal 1893 FLAGS_unsigned = 2, // u or U suffix 1894 FLAGS_long = 4, // L suffix 1895 } 1896 1897 alias FLAGS_none = FLAGS.FLAGS_none; 1898 alias FLAGS_decimal = FLAGS.FLAGS_decimal; 1899 alias FLAGS_unsigned = FLAGS.FLAGS_unsigned; 1900 alias FLAGS_long = FLAGS.FLAGS_long; 1901 1902 FLAGS flags = (base == 10) ? FLAGS_decimal : FLAGS_none; 1903 // Parse trailing 'u', 'U', 'l' or 'L' in any combination 1904 const psuffix = p; 1905 while (1) 1906 { 1907 FLAGS f; 1908 switch (*p) 1909 { 1910 case 'U': 1911 case 'u': 1912 f = FLAGS_unsigned; 1913 goto L1; 1914 case 'l': 1915 f = FLAGS_long; 1916 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead"); 1917 goto L1; 1918 case 'L': 1919 f = FLAGS_long; 1920 L1: 1921 p++; 1922 if ((flags & f) && !err) 1923 { 1924 error("unrecognized token"); 1925 err = true; 1926 } 1927 flags = cast(FLAGS)(flags | f); 1928 continue; 1929 default: 1930 break; 1931 } 1932 break; 1933 } 1934 if (base == 8 && n >= 8) 1935 error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", n, p - psuffix, psuffix, n, p - psuffix, psuffix); 1936 TOK result; 1937 switch (flags) 1938 { 1939 case FLAGS_none: 1940 /* Octal or Hexadecimal constant. 1941 * First that fits: int, uint, long, ulong 1942 */ 1943 if (n & 0x8000000000000000L) 1944 result = TOKuns64v; 1945 else if (n & 0xFFFFFFFF00000000L) 1946 result = TOKint64v; 1947 else if (n & 0x80000000) 1948 result = TOKuns32v; 1949 else 1950 result = TOKint32v; 1951 break; 1952 case FLAGS_decimal: 1953 /* First that fits: int, long, long long 1954 */ 1955 if (n & 0x8000000000000000L) 1956 { 1957 if (!err) 1958 { 1959 error("signed integer overflow"); 1960 err = true; 1961 } 1962 result = TOKuns64v; 1963 } 1964 else if (n & 0xFFFFFFFF80000000L) 1965 result = TOKint64v; 1966 else 1967 result = TOKint32v; 1968 break; 1969 case FLAGS_unsigned: 1970 case FLAGS_decimal | FLAGS_unsigned: 1971 /* First that fits: uint, ulong 1972 */ 1973 if (n & 0xFFFFFFFF00000000L) 1974 result = TOKuns64v; 1975 else 1976 result = TOKuns32v; 1977 break; 1978 case FLAGS_decimal | FLAGS_long: 1979 if (n & 0x8000000000000000L) 1980 { 1981 if (!err) 1982 { 1983 error("signed integer overflow"); 1984 err = true; 1985 } 1986 result = TOKuns64v; 1987 } 1988 else 1989 result = TOKint64v; 1990 break; 1991 case FLAGS_long: 1992 if (n & 0x8000000000000000L) 1993 result = TOKuns64v; 1994 else 1995 result = TOKint64v; 1996 break; 1997 case FLAGS_unsigned | FLAGS_long: 1998 case FLAGS_decimal | FLAGS_unsigned | FLAGS_long: 1999 result = TOKuns64v; 2000 break; 2001 default: 2002 debug 2003 { 2004 printf("%x\n", flags); 2005 } 2006 assert(0); 2007 } 2008 t.uns64value = n; 2009 return result; 2010 } 2011 2012 /************************************** 2013 * Read in characters, converting them to real. 2014 * Bugs: 2015 * Exponent overflow not detected. 2016 * Too much requested precision is not detected. 2017 */ 2018 final TOK inreal(Token* t) 2019 { 2020 //printf("Lexer::inreal()\n"); 2021 debug 2022 { 2023 assert(*p == '.' || isdigit(*p)); 2024 } 2025 stringbuffer.reset(); 2026 auto pstart = p; 2027 bool hex = false; 2028 dchar c = *p++; 2029 // Leading '0x' 2030 if (c == '0') 2031 { 2032 c = *p++; 2033 if (c == 'x' || c == 'X') 2034 { 2035 hex = true; 2036 c = *p++; 2037 } 2038 } 2039 // Digits to left of '.' 2040 while (1) 2041 { 2042 if (c == '.') 2043 { 2044 c = *p++; 2045 break; 2046 } 2047 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2048 { 2049 c = *p++; 2050 continue; 2051 } 2052 break; 2053 } 2054 // Digits to right of '.' 2055 while (1) 2056 { 2057 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2058 { 2059 c = *p++; 2060 continue; 2061 } 2062 break; 2063 } 2064 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) 2065 { 2066 c = *p++; 2067 if (c == '-' || c == '+') 2068 { 2069 c = *p++; 2070 } 2071 bool anyexp = false; 2072 while (1) 2073 { 2074 if (isdigit(c)) 2075 { 2076 anyexp = true; 2077 c = *p++; 2078 continue; 2079 } 2080 if (c == '_') 2081 { 2082 c = *p++; 2083 continue; 2084 } 2085 if (!anyexp) 2086 error("missing exponent"); 2087 break; 2088 } 2089 } 2090 else if (hex) 2091 error("exponent required for hex float"); 2092 --p; 2093 while (pstart < p) 2094 { 2095 if (*pstart != '_') 2096 stringbuffer.writeByte(*pstart); 2097 ++pstart; 2098 } 2099 stringbuffer.writeByte(0); 2100 auto sbufptr = cast(const(char)*)stringbuffer.data; 2101 TOK result; 2102 t.float80value = Port.strtold(sbufptr, null); 2103 errno = 0; 2104 switch (*p) 2105 { 2106 case 'F': 2107 case 'f': 2108 // Only interested in errno return 2109 cast(void)Port.strtof(sbufptr, null); 2110 result = TOKfloat32v; 2111 p++; 2112 break; 2113 default: 2114 /* Should do our own strtod(), since dmc and linux gcc 2115 * accept 2.22507e-308, while apple gcc will only take 2116 * 2.22508e-308. Not sure who is right. 2117 */ 2118 // Only interested in errno return 2119 cast(void)Port.strtod(sbufptr, null); 2120 result = TOKfloat64v; 2121 break; 2122 case 'l': 2123 error("use 'L' suffix instead of 'l'"); 2124 goto case 'L'; 2125 case 'L': 2126 result = TOKfloat80v; 2127 p++; 2128 break; 2129 } 2130 if (*p == 'i' || *p == 'I') 2131 { 2132 if (*p == 'I') 2133 error("use 'i' suffix instead of 'I'"); 2134 p++; 2135 switch (result) 2136 { 2137 case TOKfloat32v: 2138 result = TOKimaginary32v; 2139 break; 2140 case TOKfloat64v: 2141 result = TOKimaginary64v; 2142 break; 2143 case TOKfloat80v: 2144 result = TOKimaginary80v; 2145 break; 2146 default: 2147 break; 2148 } 2149 } 2150 if (errno == ERANGE) 2151 { 2152 const char* suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : ""; 2153 error(scanloc, "number '%s%s' is not representable", sbufptr, suffix); 2154 } 2155 debug 2156 { 2157 switch (result) 2158 { 2159 case TOKfloat32v: 2160 case TOKfloat64v: 2161 case TOKfloat80v: 2162 case TOKimaginary32v: 2163 case TOKimaginary64v: 2164 case TOKimaginary80v: 2165 break; 2166 default: 2167 assert(0); 2168 } 2169 } 2170 return result; 2171 } 2172 2173 final Loc loc() 2174 { 2175 scanloc.charnum = cast(uint)(1 + p - line); 2176 return scanloc; 2177 } 2178 2179 final void error(const(char)* format, ...) 2180 { 2181 va_list ap; 2182 va_start(ap, format); 2183 .verror(token.loc, format, ap); 2184 va_end(ap); 2185 errors = true; 2186 } 2187 2188 final void error(Loc loc, const(char)* format, ...) 2189 { 2190 va_list ap; 2191 va_start(ap, format); 2192 .verror(loc, format, ap); 2193 va_end(ap); 2194 errors = true; 2195 } 2196 2197 final void deprecation(const(char)* format, ...) 2198 { 2199 va_list ap; 2200 va_start(ap, format); 2201 .vdeprecation(token.loc, format, ap); 2202 va_end(ap); 2203 if (global.params.useDeprecated == 0) 2204 errors = true; 2205 } 2206 2207 /********************************************* 2208 * parse: 2209 * #line linnum [filespec] 2210 * also allow __LINE__ for linnum, and __FILE__ for filespec 2211 */ 2212 final void poundLine() 2213 { 2214 auto linnum = this.scanloc.linnum; 2215 const(char)* filespec = null; 2216 const loc = this.loc(); 2217 Token tok; 2218 scan(&tok); 2219 if (tok.value == TOKint32v || tok.value == TOKint64v) 2220 { 2221 const lin = cast(int)(tok.uns64value - 1); 2222 if (lin != tok.uns64value - 1) 2223 error("line number %lld out of range", cast(ulong)tok.uns64value); 2224 else 2225 linnum = lin; 2226 } 2227 else if (tok.value == TOKline) 2228 { 2229 } 2230 else 2231 goto Lerr; 2232 while (1) 2233 { 2234 switch (*p) 2235 { 2236 case 0: 2237 case 0x1A: 2238 case '\n': 2239 Lnewline: 2240 this.scanloc.linnum = linnum; 2241 if (filespec) 2242 this.scanloc.filename = filespec; 2243 return; 2244 case '\r': 2245 p++; 2246 if (*p != '\n') 2247 { 2248 p--; 2249 goto Lnewline; 2250 } 2251 continue; 2252 case ' ': 2253 case '\t': 2254 case '\v': 2255 case '\f': 2256 p++; 2257 continue; 2258 // skip white space 2259 case '_': 2260 if (memcmp(p, "__FILE__".ptr, 8) == 0) 2261 { 2262 p += 8; 2263 filespec = mem.xstrdup(scanloc.filename); 2264 continue; 2265 } 2266 goto Lerr; 2267 case '"': 2268 if (filespec) 2269 goto Lerr; 2270 stringbuffer.reset(); 2271 p++; 2272 while (1) 2273 { 2274 uint c; 2275 c = *p; 2276 switch (c) 2277 { 2278 case '\n': 2279 case '\r': 2280 case 0: 2281 case 0x1A: 2282 goto Lerr; 2283 case '"': 2284 stringbuffer.writeByte(0); 2285 filespec = mem.xstrdup(cast(const(char)*)stringbuffer.data); 2286 p++; 2287 break; 2288 default: 2289 if (c & 0x80) 2290 { 2291 uint u = decodeUTF(); 2292 if (u == PS || u == LS) 2293 goto Lerr; 2294 } 2295 stringbuffer.writeByte(c); 2296 p++; 2297 continue; 2298 } 2299 break; 2300 } 2301 continue; 2302 default: 2303 if (*p & 0x80) 2304 { 2305 uint u = decodeUTF(); 2306 if (u == PS || u == LS) 2307 goto Lnewline; 2308 } 2309 goto Lerr; 2310 } 2311 } 2312 Lerr: 2313 error(loc, "#line integer [\"filespec\"]\\n expected"); 2314 } 2315 2316 /******************************************** 2317 * Decode UTF character. 2318 * Issue error messages for invalid sequences. 2319 * Return decoded character, advance p to last character in UTF sequence. 2320 */ 2321 final uint decodeUTF() 2322 { 2323 const s = p; 2324 assert(*s & 0x80); 2325 // Check length of remaining string up to 6 UTF-8 characters 2326 size_t len; 2327 for (len = 1; len < 6 && s[len]; len++) 2328 { 2329 } 2330 size_t idx = 0; 2331 dchar u; 2332 const msg = utf_decodeChar(s, len, idx, u); 2333 p += idx - 1; 2334 if (msg) 2335 { 2336 error("%s", msg); 2337 } 2338 return u; 2339 } 2340 2341 /*************************************************** 2342 * Parse doc comment embedded between t->ptr and p. 2343 * Remove trailing blanks and tabs from lines. 2344 * Replace all newlines with \n. 2345 * Remove leading comment character from each line. 2346 * Decide if it's a lineComment or a blockComment. 2347 * Append to previous one for this token. 2348 */ 2349 final void getDocComment(Token* t, uint lineComment) 2350 { 2351 /* ct tells us which kind of comment it is: '/', '*', or '+' 2352 */ 2353 const ct = t.ptr[2]; 2354 /* Start of comment text skips over / * *, / + +, or / / / 2355 */ 2356 const(char)* q = t.ptr + 3; // start of comment text 2357 const(char)* qend = p; 2358 if (ct == '*' || ct == '+') 2359 qend -= 2; 2360 /* Scan over initial row of ****'s or ++++'s or ////'s 2361 */ 2362 for (; q < qend; q++) 2363 { 2364 if (*q != ct) 2365 break; 2366 } 2367 /* Remove leading spaces until start of the comment 2368 */ 2369 int linestart = 0; 2370 if (ct == '/') 2371 { 2372 while (q < qend && (*q == ' ' || *q == '\t')) 2373 ++q; 2374 } 2375 else if (q < qend) 2376 { 2377 if (*q == '\r') 2378 { 2379 ++q; 2380 if (q < qend && *q == '\n') 2381 ++q; 2382 linestart = 1; 2383 } 2384 else if (*q == '\n') 2385 { 2386 ++q; 2387 linestart = 1; 2388 } 2389 } 2390 /* Remove trailing row of ****'s or ++++'s 2391 */ 2392 if (ct != '/') 2393 { 2394 for (; q < qend; qend--) 2395 { 2396 if (qend[-1] != ct) 2397 break; 2398 } 2399 } 2400 /* Comment is now [q .. qend]. 2401 * Canonicalize it into buf[]. 2402 */ 2403 OutBuffer buf; 2404 for (; q < qend; q++) 2405 { 2406 char c = *q; 2407 switch (c) 2408 { 2409 case '*': 2410 case '+': 2411 if (linestart && c == ct) 2412 { 2413 linestart = 0; 2414 /* Trim preceding whitespace up to preceding \n 2415 */ 2416 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2417 buf.offset--; 2418 continue; 2419 } 2420 break; 2421 case ' ': 2422 case '\t': 2423 break; 2424 case '\r': 2425 if (q[1] == '\n') 2426 continue; 2427 // skip the \r 2428 goto Lnewline; 2429 default: 2430 if (c == 226) 2431 { 2432 // If LS or PS 2433 if (q[1] == 128 && (q[2] == 168 || q[2] == 169)) 2434 { 2435 q += 2; 2436 goto Lnewline; 2437 } 2438 } 2439 linestart = 0; 2440 break; 2441 Lnewline: 2442 c = '\n'; // replace all newlines with \n 2443 goto case; 2444 case '\n': 2445 linestart = 1; 2446 /* Trim trailing whitespace 2447 */ 2448 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2449 buf.offset--; 2450 break; 2451 } 2452 buf.writeByte(c); 2453 } 2454 /* Trim trailing whitespace (if the last line does not have newline) 2455 */ 2456 if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2457 { 2458 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) 2459 buf.offset--; 2460 } 2461 // Always end with a newline 2462 if (!buf.offset || buf.data[buf.offset - 1] != '\n') 2463 buf.writeByte('\n'); 2464 buf.writeByte(0); 2465 // It's a line comment if the start of the doc comment comes 2466 // after other non-whitespace on the same line. 2467 const(char)** dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment; 2468 // Combine with previous doc comment, if any 2469 if (*dc) 2470 *dc = combineComments(*dc, cast(const(char)*)buf.data); 2471 else 2472 *dc = cast(const(char)*)buf.extractData(); 2473 } 2474 2475 /******************************************** 2476 * Combine two document comments into one, 2477 * separated by a newline. 2478 */ 2479 final static const(char)* combineComments(const(char)* c1, const(char)* c2) 2480 { 2481 //printf("Lexer::combineComments('%s', '%s')\n", c1, c2); 2482 auto c = c2; 2483 if (c1) 2484 { 2485 c = c1; 2486 if (c2) 2487 { 2488 size_t len1 = strlen(c1); 2489 size_t len2 = strlen(c2); 2490 int insertNewLine = 0; 2491 if (len1 && c1[len1 - 1] != '\n') 2492 { 2493 ++len1; 2494 insertNewLine = 1; 2495 } 2496 auto p = cast(char*)mem.xmalloc(len1 + 1 + len2 + 1); 2497 memcpy(p, c1, len1 - insertNewLine); 2498 if (insertNewLine) 2499 p[len1 - 1] = '\n'; 2500 p[len1] = '\n'; 2501 memcpy(p + len1 + 1, c2, len2); 2502 p[len1 + 1 + len2] = 0; 2503 c = p; 2504 } 2505 } 2506 return c; 2507 } 2508 2509 private: 2510 final void endOfLine() 2511 { 2512 scanloc.linnum++; 2513 line = p; 2514 } 2515 }