clang 22.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements FormatTokenLexer, which tokenizes a source file
11/// into a FormatToken stream suitable for ClangFormat.
12///
13//===----------------------------------------------------------------------===//
14
15#include "FormatTokenLexer.h"
16#include "FormatToken.h"
20#include "clang/Format/Format.h"
21#include "llvm/Support/Regex.h"
22
23namespace clang {
24namespace format {
25
27 const SourceManager &SourceMgr, FileID ID, unsigned Column,
28 const FormatStyle &Style, encoding::Encoding Encoding,
29 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
30 IdentifierTable &IdentTable)
31 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
32 Column(Column), TrailingWhitespace(0),
33 LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
34 Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
35 Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
36 FormattingDisabled(false), FormatOffRegex(Style.OneLineFormatOffRegex),
37 MacroBlockBeginRegex(Style.MacroBlockBegin),
38 MacroBlockEndRegex(Style.MacroBlockEnd) {
39 Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
40 Lex->SetKeepWhitespaceMode(true);
41
42 for (const std::string &ForEachMacro : Style.ForEachMacros) {
43 auto Identifier = &IdentTable.get(ForEachMacro);
44 Macros.insert({Identifier, TT_ForEachMacro});
45 }
46 for (const std::string &IfMacro : Style.IfMacros) {
47 auto Identifier = &IdentTable.get(IfMacro);
48 Macros.insert({Identifier, TT_IfMacro});
49 }
50 for (const std::string &AttributeMacro : Style.AttributeMacros) {
51 auto Identifier = &IdentTable.get(AttributeMacro);
52 Macros.insert({Identifier, TT_AttributeMacro});
53 }
54 for (const std::string &StatementMacro : Style.StatementMacros) {
55 auto Identifier = &IdentTable.get(StatementMacro);
56 Macros.insert({Identifier, TT_StatementMacro});
57 }
58 for (const std::string &TypenameMacro : Style.TypenameMacros) {
59 auto Identifier = &IdentTable.get(TypenameMacro);
60 Macros.insert({Identifier, TT_TypenameMacro});
61 }
62 for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
63 auto Identifier = &IdentTable.get(NamespaceMacro);
64 Macros.insert({Identifier, TT_NamespaceMacro});
65 }
66 for (const std::string &WhitespaceSensitiveMacro :
67 Style.WhitespaceSensitiveMacros) {
68 auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
69 Macros.insert({Identifier, TT_UntouchableMacroFunc});
70 }
71 for (const std::string &StatementAttributeLikeMacro :
72 Style.StatementAttributeLikeMacros) {
73 auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
74 Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
75 }
76
77 for (const auto &Macro : Style.MacrosSkippedByRemoveParentheses)
78 MacrosSkippedByRemoveParentheses.insert(&IdentTable.get(Macro));
79 for (const auto &TemplateName : Style.TemplateNames)
80 TemplateNames.insert(&IdentTable.get(TemplateName));
81 for (const auto &TypeName : Style.TypeNames)
82 TypeNames.insert(&IdentTable.get(TypeName));
83 for (const auto &VariableTemplate : Style.VariableTemplates)
84 VariableTemplates.insert(&IdentTable.get(VariableTemplate));
85}
86
88 assert(Tokens.empty());
89 assert(FirstInLineIndex == 0);
90 enum { FO_None, FO_CurrentLine, FO_NextLine } FormatOff = FO_None;
91 do {
92 Tokens.push_back(getNextToken());
93 auto &Tok = *Tokens.back();
94 const auto NewlinesBefore = Tok.NewlinesBefore;
95 switch (FormatOff) {
96 case FO_CurrentLine:
97 if (NewlinesBefore == 0)
98 Tok.Finalized = true;
99 else
100 FormatOff = FO_None;
101 break;
102 case FO_NextLine:
103 if (NewlinesBefore > 1) {
104 FormatOff = FO_None;
105 } else {
106 Tok.Finalized = true;
107 FormatOff = FO_CurrentLine;
108 }
109 break;
110 default:
111 if (!FormattingDisabled && FormatOffRegex.match(Tok.TokenText)) {
112 if (Tok.is(tok::comment) &&
113 (NewlinesBefore > 0 || Tokens.size() == 1)) {
114 Tok.Finalized = true;
115 FormatOff = FO_NextLine;
116 } else {
117 for (auto *Token : reverse(Tokens)) {
118 Token->Finalized = true;
119 if (Token->NewlinesBefore > 0)
120 break;
121 }
122 FormatOff = FO_CurrentLine;
123 }
124 }
125 }
126 if (Style.isJavaScript()) {
127 tryParseJSRegexLiteral();
128 handleTemplateStrings();
129 } else if (Style.isTextProto()) {
130 tryParsePythonComment();
131 }
132 tryMergePreviousTokens();
133 if (Style.isCSharp()) {
134 // This needs to come after tokens have been merged so that C#
135 // string literals are correctly identified.
136 handleCSharpVerbatimAndInterpolatedStrings();
137 } else if (Style.isTableGen()) {
138 handleTableGenMultilineString();
139 handleTableGenNumericLikeIdentifier();
140 }
141 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
142 FirstInLineIndex = Tokens.size() - 1;
143 } while (Tokens.back()->isNot(tok::eof));
144 if (Style.InsertNewlineAtEOF) {
145 auto &TokEOF = *Tokens.back();
146 if (TokEOF.NewlinesBefore == 0) {
147 TokEOF.NewlinesBefore = 1;
148 TokEOF.OriginalColumn = 0;
149 }
150 }
151 return Tokens;
152}
153
// Attempts to fold the most recently lexed tokens into a single FormatToken
// where the current language spells one logical token as several C++ tokens
// (e.g. "=>", "??", Verilog "<->", TableGen "!if"). Called once per lexed
// token; each successful merge returns immediately, so at most one rule
// fires per call.
void FormatTokenLexer::tryMergePreviousTokens() {
  // Language-independent merges first.
  if (tryMerge_TMacro())
    return;
  if (tryMergeConflictMarkers())
    return;
  if (tryMergeLessLess())
    return;
  if (tryMergeGreaterGreater())
    return;
  if (tryMergeForEach())
    return;
  if (Style.isCpp() && tryTransformTryUsageForC())
    return;

  if ((Style.Language == FormatStyle::LK_Cpp ||
       Style.Language == FormatStyle::LK_ObjC) &&
      tryMergeUserDefinedLiteral()) {
    return;
  }

  // Operators shared by JavaScript and C#.
  if (Style.isJavaScript() || Style.isCSharp()) {
    static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
                                                               tok::question};
    static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
                                                             tok::period};
    static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};

    if (tryMergeTokens(FatArrow, TT_FatArrow))
      return;
    if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
      // Treat like the "||" operator (as opposed to the ternary ?).
      Tokens.back()->Tok.setKind(tok::pipepipe);
      return;
    }
    if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
      // Treat like a regular "." access.
      Tokens.back()->Tok.setKind(tok::period);
      return;
    }
    if (tryMergeNullishCoalescingEqual())
      return;

    if (Style.isCSharp()) {
      static const tok::TokenKind CSharpNullConditionalLSquare[] = {
          tok::question, tok::l_square};

      if (tryMergeCSharpKeywordVariables())
        return;
      if (tryMergeCSharpStringLiteral())
        return;
      if (tryTransformCSharpForEach())
        return;
      if (tryMergeTokens(CSharpNullConditionalLSquare,
                         TT_CSharpNullConditionalLSquare)) {
        // Treat like a regular "[" operator.
        Tokens.back()->Tok.setKind(tok::l_square);
        return;
      }
    }
  }

  if (tryMergeNSStringLiteral())
    return;

  if (Style.isJavaScript()) {
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
                                                   tok::equal};
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
                                                  tok::greaterequal};
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
                                                           tok::starequal};
    static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
    static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};

    // FIXME: Investigate what token type gives the correct operator priority.
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
      return;
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
      return;
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
      Tokens.back()->Tok.setKind(tok::starequal);
      return;
    }
    if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
        tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
      // Treat like the "=" assignment operator.
      Tokens.back()->Tok.setKind(tok::equal);
      return;
    }
    if (tryMergeJSPrivateIdentifier())
      return;
  } else if (Style.isJava()) {
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
        tok::greater, tok::greater, tok::greaterequal};
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
      return;
  } else if (Style.isVerilog()) {
    // Merge the number following a base like `'h?a0`.
    if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
        Tokens.end()[-2]->is(tok::numeric_constant) &&
        Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
                               tok::question) &&
        tryMergeTokens(2, TT_Unknown)) {
      return;
    }
    // Part select.
    if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
                          TT_BitFieldColon)) {
      return;
    }
    // Xnor. The combined token is treated as a caret which can also be either a
    // unary or binary operator. The actual type is determined in
    // TokenAnnotator. We also check the token length so we know it is not
    // already a merged token.
    if (Tokens.back()->TokenText.size() == 1 &&
        tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
                          TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::caret);
      return;
    }
    // Signed shift and distribution weight.
    if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::lessless);
      return;
    }
    if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
      Tokens.back()->Tok.setKind(tok::greatergreater);
      return;
    }
    // Compound assignment operators; force assignment precedence so the
    // annotator parses them like "=".
    if (tryMergeTokensAny({{tok::lessless, tok::equal},
                           {tok::lessless, tok::lessequal},
                           {tok::greatergreater, tok::equal},
                           {tok::greatergreater, tok::greaterequal},
                           {tok::colon, tok::equal},
                           {tok::colon, tok::slash}},
                          TT_BinaryOperator)) {
      Tokens.back()->ForcedPrecedence = prec::Assignment;
      return;
    }
    // Exponentiation, signed shift, case equality, and wildcard equality.
    if (tryMergeTokensAny({{tok::star, tok::star},
                           {tok::lessless, tok::less},
                           {tok::greatergreater, tok::greater},
                           {tok::exclaimequal, tok::equal},
                           {tok::exclaimequal, tok::question},
                           {tok::equalequal, tok::equal},
                           {tok::equalequal, tok::question}},
                          TT_BinaryOperator)) {
      return;
    }
    // Module paths in specify blocks and the implication and boolean equality
    // operators.
    if (tryMergeTokensAny({{tok::plusequal, tok::greater},
                           {tok::plus, tok::star, tok::greater},
                           {tok::minusequal, tok::greater},
                           {tok::minus, tok::star, tok::greater},
                           {tok::less, tok::arrow},
                           {tok::equal, tok::greater},
                           {tok::star, tok::greater},
                           {tok::pipeequal, tok::greater},
                           {tok::pipe, tok::arrow},
                           {tok::hash, tok::minus, tok::hash},
                           {tok::hash, tok::equal, tok::hash}},
                          TT_BinaryOperator) ||
        Tokens.back()->is(tok::arrow)) {
      Tokens.back()->ForcedPrecedence = prec::Comma;
      return;
    }
  } else if (Style.isTableGen()) {
    // TableGen's Multi line string starts with [{
    if (tryMergeTokens({tok::l_square, tok::l_brace},
                       TT_TableGenMultiLineString)) {
      // Set again with finalizing. This must never be annotated as other types.
      Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
      Tokens.back()->Tok.setKind(tok::string_literal);
      return;
    }
    // TableGen's bang operator is the form !<name>.
    // !cond is a special case with specific syntax.
    if (tryMergeTokens({tok::exclaim, tok::identifier},
                       TT_TableGenBangOperator)) {
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      if (Tokens.back()->TokenText == "!cond")
        Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
      else
        Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
      // Here, "! if" becomes "!if". That is, ! captures if even when the space
      // exists. That is only one possibility in TableGen's syntax.
      Tokens.back()->Tok.setKind(tok::identifier);
      Tokens.back()->Tok.setIdentifierInfo(nullptr);
      Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
      return;
    }
    // +, - with numbers are literals. Not unary operators.
    if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
    if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
      Tokens.back()->Tok.setKind(tok::numeric_constant);
      return;
    }
  }
}
368
369bool FormatTokenLexer::tryMergeNSStringLiteral() {
370 if (Tokens.size() < 2)
371 return false;
372 auto &At = *(Tokens.end() - 2);
373 auto &String = *(Tokens.end() - 1);
374 if (At->isNot(tok::at) || String->isNot(tok::string_literal))
375 return false;
376 At->Tok.setKind(tok::string_literal);
377 At->TokenText = StringRef(At->TokenText.begin(),
378 String->TokenText.end() - At->TokenText.begin());
379 At->ColumnWidth += String->ColumnWidth;
380 At->setType(TT_ObjCStringLiteral);
381 Tokens.erase(Tokens.end() - 1);
382 return true;
383}
384
385bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
386 // Merges #idenfier into a single identifier with the text #identifier
387 // but the token tok::identifier.
388 if (Tokens.size() < 2)
389 return false;
390 auto &Hash = *(Tokens.end() - 2);
391 auto &Identifier = *(Tokens.end() - 1);
392 if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
393 return false;
394 Hash->Tok.setKind(tok::identifier);
395 Hash->TokenText =
396 StringRef(Hash->TokenText.begin(),
397 Identifier->TokenText.end() - Hash->TokenText.begin());
398 Hash->ColumnWidth += Identifier->ColumnWidth;
399 Hash->setType(TT_JsPrivateIdentifier);
400 Tokens.erase(Tokens.end() - 1);
401 return true;
402}
403
404// Search for verbatim or interpolated string literals @"ABC" or
405// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
406// prevent splitting of @, $ and ".
407// Merging of multiline verbatim strings with embedded '"' is handled in
408// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
409bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
410 if (Tokens.size() < 2)
411 return false;
412
413 // Look for @"aaaaaa" or $"aaaaaa".
414 const auto String = *(Tokens.end() - 1);
415 if (String->isNot(tok::string_literal))
416 return false;
417
418 auto Prefix = *(Tokens.end() - 2);
419 if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
420 return false;
421
422 if (Tokens.size() > 2) {
423 const auto Tok = *(Tokens.end() - 3);
424 if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
425 (Tok->is(tok::at) && Prefix->TokenText == "$")) {
426 // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
427 Tok->ColumnWidth += Prefix->ColumnWidth;
428 Tokens.erase(Tokens.end() - 2);
429 Prefix = Tok;
430 }
431 }
432
433 // Convert back into just a string_literal.
434 Prefix->Tok.setKind(tok::string_literal);
435 Prefix->TokenText =
436 StringRef(Prefix->TokenText.begin(),
437 String->TokenText.end() - Prefix->TokenText.begin());
438 Prefix->ColumnWidth += String->ColumnWidth;
439 Prefix->setType(TT_CSharpStringLiteral);
440 Tokens.erase(Tokens.end() - 1);
441 return true;
442}
443
// Valid C# attribute targets:
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
// NOTE(review): presumably consulted when classifying `[target: ...]`
// attribute specifiers; the use site is outside this chunk — confirm there.
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
    "assembly", "module", "field", "event", "method",
    "param", "property", "return", "type",
};
450
451bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
452 if (Tokens.size() < 2)
453 return false;
454 auto &NullishCoalescing = *(Tokens.end() - 2);
455 auto &Equal = *(Tokens.end() - 1);
456 if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
457 Equal->isNot(tok::equal)) {
458 return false;
459 }
460 NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
461 NullishCoalescing->TokenText =
462 StringRef(NullishCoalescing->TokenText.begin(),
463 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
464 NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
465 NullishCoalescing->setType(TT_NullCoalescingEqual);
466 Tokens.erase(Tokens.end() - 1);
467 return true;
468}
469
470bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
471 if (Tokens.size() < 2)
472 return false;
473 const auto At = *(Tokens.end() - 2);
474 if (At->isNot(tok::at))
475 return false;
476 const auto Keyword = *(Tokens.end() - 1);
477 if (Keyword->TokenText == "$")
478 return false;
479 if (!Keywords.isCSharpKeyword(*Keyword))
480 return false;
481
482 At->Tok.setKind(tok::identifier);
483 At->TokenText = StringRef(At->TokenText.begin(),
484 Keyword->TokenText.end() - At->TokenText.begin());
485 At->ColumnWidth += Keyword->ColumnWidth;
486 At->setType(Keyword->getType());
487 Tokens.erase(Tokens.end() - 1);
488 return true;
489}
490
491// In C# transform identifier foreach into kw_foreach
492bool FormatTokenLexer::tryTransformCSharpForEach() {
493 if (Tokens.empty())
494 return false;
495 auto &Identifier = *(Tokens.end() - 1);
496 if (Identifier->isNot(tok::identifier))
497 return false;
498 if (Identifier->TokenText != "foreach")
499 return false;
500
501 Identifier->setType(TT_ForEachMacro);
502 Identifier->Tok.setKind(tok::kw_for);
503 return true;
504}
505
506bool FormatTokenLexer::tryMergeForEach() {
507 if (Tokens.size() < 2)
508 return false;
509 auto &For = *(Tokens.end() - 2);
510 auto &Each = *(Tokens.end() - 1);
511 if (For->isNot(tok::kw_for))
512 return false;
513 if (Each->isNot(tok::identifier))
514 return false;
515 if (Each->TokenText != "each")
516 return false;
517
518 For->setType(TT_ForEachMacro);
519 For->Tok.setKind(tok::kw_for);
520
521 For->TokenText = StringRef(For->TokenText.begin(),
522 Each->TokenText.end() - For->TokenText.begin());
523 For->ColumnWidth += Each->ColumnWidth;
524 Tokens.erase(Tokens.end() - 1);
525 return true;
526}
527
528bool FormatTokenLexer::tryTransformTryUsageForC() {
529 if (Tokens.size() < 2)
530 return false;
531 auto &Try = *(Tokens.end() - 2);
532 if (Try->isNot(tok::kw_try))
533 return false;
534 auto &Next = *(Tokens.end() - 1);
535 if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
536 return false;
537
538 if (Tokens.size() > 2) {
539 auto &At = *(Tokens.end() - 3);
540 if (At->is(tok::at))
541 return false;
542 }
543
544 Try->Tok.setKind(tok::identifier);
545 return true;
546}
547
bool FormatTokenLexer::tryMergeLessLess() {
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
  if (Tokens.size() < 3)
    return false;

  auto First = Tokens.end() - 3;
  if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
    return false;

  // Only merge if there currently is no whitespace between the two "<".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // X is the token before the "<<" pair, if any; a preceding "<" blocks the
  // merge per the rule above.
  auto X = Tokens.size() > 3 ? First[-1] : nullptr;
  if (X && X->is(tok::less))
    return false;

  // A following "<" also blocks the merge — except after "operator", where
  // "operator<<" followed by "<" is legitimate.
  auto Y = First[2];
  if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
    return false;

  // Rewrite the first "<" into "<<" and drop the second one.
  First[0]->Tok.setKind(tok::lessless);
  First[0]->TokenText = "<<";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 2);
  return true;
}
575
bool FormatTokenLexer::tryMergeGreaterGreater() {
  // Merge kw_operator,greater,greater into kw_operator,greatergreater.
  if (Tokens.size() < 2)
    return false;

  auto First = Tokens.end() - 2;
  if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
    return false;

  // Only merge if there currently is no whitespace between the first two ">".
  if (First[1]->hasWhitespaceBefore())
    return false;

  // Unlike "<<", only merge directly after "operator" (per the comment
  // above) — elsewhere a ">" pair may be two closing template brackets.
  auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
  if (Tok && Tok->isNot(tok::kw_operator))
    return false;

  // Rewrite the first ">" into ">>" and drop the second one.
  First[0]->Tok.setKind(tok::greatergreater);
  First[0]->TokenText = ">>";
  First[0]->ColumnWidth += 1;
  Tokens.erase(Tokens.end() - 1);
  return true;
}
599
600bool FormatTokenLexer::tryMergeUserDefinedLiteral() {
601 if (Tokens.size() < 2)
602 return false;
603
604 auto *First = Tokens.end() - 2;
605 auto &Suffix = First[1];
606 if (Suffix->hasWhitespaceBefore() || Suffix->TokenText != "$")
607 return false;
608
609 auto &Literal = First[0];
610 if (!Literal->Tok.isLiteral())
611 return false;
612
613 auto &Text = Literal->TokenText;
614 if (!Text.ends_with("_"))
615 return false;
616
617 Text = StringRef(Text.data(), Text.size() + 1);
618 ++Literal->ColumnWidth;
619 Tokens.erase(&Suffix);
620 return true;
621}
622
623bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
624 TokenType NewType) {
625 if (Tokens.size() < Kinds.size())
626 return false;
627
628 const auto *First = Tokens.end() - Kinds.size();
629 for (unsigned i = 0; i < Kinds.size(); ++i)
630 if (First[i]->isNot(Kinds[i]))
631 return false;
632
633 return tryMergeTokens(Kinds.size(), NewType);
634}
635
// Merges the last Count tokens into a single token of type NewType. Fails if
// any adjacent pair is separated by whitespace (merged tokens must be
// contiguous in the buffer).
bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
  if (Tokens.size() < Count)
    return false;

  const auto *First = Tokens.end() - Count;
  unsigned AddLength = 0;
  for (size_t i = 1; i < Count; ++i) {
    // If there is whitespace separating the token and the previous one,
    // they should not be merged.
    if (First[i]->hasWhitespaceBefore())
      return false;
    AddLength += First[i]->TokenText.size();
  }

  // Drop the merged-away tokens. Shrinking does not reallocate, so First
  // still points at the surviving first token.
  Tokens.resize(Tokens.size() - Count + 1);
  // Extend the first token's text over the merged ones (valid because they
  // were verified contiguous above) and retag it.
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
                                  First[0]->TokenText.size() + AddLength);
  First[0]->ColumnWidth += AddLength;
  First[0]->setType(NewType);
  return true;
}
657
658bool FormatTokenLexer::tryMergeTokensAny(
660 return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
661 return tryMergeTokens(Kinds, NewType);
662 });
663}
664
665// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
666bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
667 // NB: This is not entirely correct, as an r_paren can introduce an operand
668 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
669 // corner case to not matter in practice, though.
670 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
671 tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
672 tok::colon, tok::question, tok::tilde) ||
673 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
674 tok::kw_else, tok::kw_void, tok::kw_typeof,
675 Keywords.kw_instanceof, Keywords.kw_in) ||
676 Tok->isPlacementOperator() || Tok->isBinaryOperator();
677}
678
679bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
680 if (!Prev)
681 return true;
682
683 // Regex literals can only follow after prefix unary operators, not after
684 // postfix unary operators. If the '++' is followed by a non-operand
685 // introducing token, the slash here is the operand and not the start of a
686 // regex.
687 // `!` is an unary prefix operator, but also a post-fix operator that casts
688 // away nullability, so the same check applies.
689 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
690 return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
691
692 // The previous token must introduce an operand location where regex
693 // literals can occur.
694 if (!precedesOperand(Prev))
695 return false;
696
697 return true;
698}
699
// Handles a Java text block ("""..."""): when the lexer has just produced an
// empty string literal `""` followed directly by a third quote, scan ahead
// for the closing `"""` and restart the lexer after it so the block's
// contents are not tokenized.
void FormatTokenLexer::tryParseJavaTextBlock() {
  if (FormatTok->TokenText != "\"\"")
    return;

  const auto *S = Lex->getBufferLocation();
  const auto *End = Lex->getBuffer().end();

  if (S == End || *S != '\"')
    return;

  ++S; // Skip the `"""` that begins a text block.

  // Find the `"""` that ends the text block.
  // Count tracks consecutive quotes; a backslash resets it to -1 so the
  // quote it escapes nets out to zero and does not count as a terminator.
  for (int Count = 0; Count < 3 && S < End; ++S) {
    switch (*S) {
    case '\\':
      Count = -1;
      break;
    case '\"':
      ++Count;
      break;
    default:
      Count = 0;
    }
  }

  // Ignore the possibly invalid text block.
  // (If unterminated, S is End and lexing resumes at end of buffer.)
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(S)));
}
729
// Tries to parse a JavaScript Regex literal starting at the current token,
// if that begins with a slash and is in a location where JavaScript allows
// regex literals. Changes the current token to a regex literal and updates
// its text if successful.
void FormatTokenLexer::tryParseJSRegexLiteral() {
  FormatToken *RegexToken = Tokens.back();
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
    return;

  // Find the last non-comment token before the slash.
  FormatToken *Prev = nullptr;
  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
    // NB: Because previous pointers are not initialized yet, this cannot use
    // Token.getPreviousNonComment.
    if (FT->isNot(tok::comment)) {
      Prev = FT;
      break;
    }
  }

  if (!canPrecedeRegexLiteral(Prev))
    return;

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
  StringRef Buffer = Lex->getBuffer();
  bool InCharacterClass = false;
  bool HaveClosingSlash = false;
  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
    // Regular expressions are terminated with a '/', which can only be
    // escaped using '\' or a character class between '[' and ']'.
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
    switch (*Offset) {
    case '\\':
      // Skip the escaped character.
      ++Offset;
      break;
    case '[':
      InCharacterClass = true;
      break;
    case ']':
      InCharacterClass = false;
      break;
    case '/':
      if (!InCharacterClass)
        HaveClosingSlash = true;
      break;
    }
  }

  RegexToken->setType(TT_RegexLiteral);
  // Treat regex literals like other string_literals.
  RegexToken->Tok.setKind(tok::string_literal);
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
  RegexToken->ColumnWidth = RegexToken->TokenText.size();

  // Resume normal lexing after the closing slash.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
}
788
// Scans a C# string body starting at Begin (just past the opening quote) and
// returns a pointer to the terminating '"', or End if unterminated.
//
// Verbatim strings treat "" as an escaped quote and do not honor '\';
// non-verbatim strings honor '\' escapes. Interpolated strings skip over
// balanced {...} holes (with "{{" / "}}" as escaped braces) so quotes inside
// holes do not terminate the literal — no attempt is made to format the
// expressions inside the holes (cf. handleTemplateStrings for JavaScript).
// An unbalanced '}' in an interpolated string also yields End.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  const auto NextIsSame = [End](const char *P) {
    return P + 1 < End && P[1] == P[0];
  };

  int OpenBraces = 0;
  for (const char *Cur = Begin; Cur < End; ++Cur) {
    const char C = *Cur;
    if (C == '\\') {
      if (!Verbatim)
        ++Cur; // Backslash escapes the next character outside verbatim mode.
    } else if (C == '{' && Interpolated) {
      if (NextIsSame(Cur))
        ++Cur; // "{{" is a literal brace.
      else
        ++OpenBraces;
    } else if (C == '}' && Interpolated) {
      if (NextIsSame(Cur))
        ++Cur; // "}}" is a literal brace.
      else if (OpenBraces > 0)
        --OpenBraces;
      else
        return End; // Unbalanced '}': give up.
    } else if (C == '"') {
      if (OpenBraces > 0)
        continue; // A quote inside an interpolation hole.
      if (Verbatim && NextIsSame(Cur)) {
        ++Cur; // "" is an escaped quote in verbatim strings.
        continue;
      }
      return Cur;
    }
  }

  return End;
}
847
// Re-lexes the current TT_CSharpStringLiteral ($"...", @"...", $@"..., @$"...)
// with lexCSharpString so that quotes inside interpolation holes and escaped
// "" pairs do not terminate the literal, then rewrites the token's text and
// column widths (including multi-line handling) and restarts the lexer after
// the closing quote.
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
  FormatToken *CSharpStringLiteral = Tokens.back();

  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
    return;

  auto &TokenText = CSharpStringLiteral->TokenText;

  // Determine the string flavor from its prefix.
  bool Verbatim = false;
  bool Interpolated = false;
  if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
    Verbatim = true;
    Interpolated = true;
  } else if (TokenText.starts_with(R"(@")")) {
    Verbatim = true;
  } else if (TokenText.starts_with(R"($")")) {
    Interpolated = true;
  }

  // Deal with multiline strings.
  if (!Verbatim && !Interpolated)
    return;

  // Rescan from the start of the literal, skipping the 2- or 3-character
  // prefix (including the opening quote).
  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
  const char *Offset = StrBegin;
  Offset += Verbatim && Interpolated ? 3 : 2;

  const auto End = Lex->getBuffer().end();
  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);

  // Make no attempt to format code properly if a verbatim string is
  // unterminated.
  if (Offset >= End)
    return;

  // Offset points at the closing quote; include it in the token text.
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
  TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
      Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    CSharpStringLiteral->IsMultiline = true;
    // The last line always starts at column 0.
    unsigned StartColumn = 0;
    CSharpStringLiteral->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  assert(Offset < End);
  // Resume lexing just past the closing quote.
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
}
906
// Extends the merged "[{" token (TT_TableGenMultiLineString) to cover the
// whole TableGen multi-line string up to and including its "}]" terminator,
// updates the token's width bookkeeping, and restarts the lexer after it.
void FormatTokenLexer::handleTableGenMultilineString() {
  FormatToken *MultiLineString = Tokens.back();
  if (MultiLineString->isNot(TT_TableGenMultiLineString))
    return;

  auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
  // "}]" is the end of multi line string.
  auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
  if (CloseOffset == StringRef::npos)
    return;
  // Token text spans "[{" through "}]" inclusive (+2 for the terminator).
  auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
  MultiLineString->TokenText = Text;
  // Resume lexing after "}]" (buffer location is currently 2 past "[{").
  resetLexer(SourceMgr.getFileOffset(
      Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
  auto FirstLineText = Text;
  auto FirstBreak = Text.find('\n');
  // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
  if (FirstBreak != StringRef::npos) {
    MultiLineString->IsMultiline = true;
    FirstLineText = Text.substr(0, FirstBreak + 1);
    // LastLineColumnWidth holds the width of the last line.
    auto LastBreak = Text.rfind('\n');
    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
        Style.TabWidth, Encoding);
  }
  // ColumnWidth holds only the width of the first line.
  MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
}
937
938void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
939 FormatToken *Tok = Tokens.back();
940 // TableGen identifiers can begin with digits. Such tokens are lexed as
941 // numeric_constant now.
942 if (Tok->isNot(tok::numeric_constant))
943 return;
944 StringRef Text = Tok->TokenText;
945 // The following check is based on llvm::TGLexer::LexToken.
946 // That lexes the token as a number if any of the following holds:
947 // 1. It starts with '+', '-'.
948 // 2. All the characters are digits.
949 // 3. The first non-digit character is 'b', and the next is '0' or '1'.
950 // 4. The first non-digit character is 'x', and the next is a hex digit.
951 // Note that in the case 3 and 4, if the next character does not exists in
952 // this token, the token is an identifier.
953 if (Text.empty() || Text[0] == '+' || Text[0] == '-')
954 return;
955 const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
956 // All the characters are digits
957 if (NonDigitPos == StringRef::npos)
958 return;
959 char FirstNonDigit = Text[NonDigitPos];
960 if (NonDigitPos < Text.size() - 1) {
961 char TheNext = Text[NonDigitPos + 1];
962 // Regarded as a binary number.
963 if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
964 return;
965 // Regarded as hex number.
966 if (FirstNonDigit == 'x' && isxdigit(TheNext))
967 return;
968 }
969 if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
970 // This is actually an identifier in TableGen.
971 Tok->Tok.setKind(tok::identifier);
972 Tok->Tok.setIdentifierInfo(nullptr);
973 }
974}
975
/// Handles backtick-delimited template strings (as in JavaScript), including
/// `${...}` interpolations, by manually lexing ahead in the buffer and merging
/// the scanned text into a single string-literal token.
void FormatTokenLexer::handleTemplateStrings() {
  FormatToken *BacktickToken = Tokens.back();

  if (BacktickToken->is(tok::l_brace)) {
    // Entering a braced scope; remember it so a later '}' can be matched.
    StateStack.push(LexerState::NORMAL);
    return;
  }
  if (BacktickToken->is(tok::r_brace)) {
    if (StateStack.size() == 1)
      return;
    StateStack.pop();
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
      return;
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
    // template string below.
  } else if (BacktickToken->is(tok::unknown) &&
             BacktickToken->TokenText == "`") {
    StateStack.push(LexerState::TEMPLATE_STRING);
  } else {
    return; // Not actually a template
  }

  // 'Manually' lex ahead in the current file buffer.
  const char *Offset = Lex->getBufferLocation();
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
  for (; Offset != Lex->getBuffer().end(); ++Offset) {
    if (Offset[0] == '`') {
      // Closing backtick: the template string ends here.
      StateStack.pop();
      ++Offset;
      break;
    }
    if (Offset[0] == '\\') {
      ++Offset; // Skip the escaped character.
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
               Offset[1] == '{') {
      // '${' introduces an expression interpolation in the template string.
      StateStack.push(LexerState::NORMAL);
      Offset += 2;
      break;
    }
  }

  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
  BacktickToken->setType(TT_TemplateString);
  BacktickToken->Tok.setKind(tok::string_literal);
  BacktickToken->TokenText = LiteralText;

  // Adjust width for potentially multiline string literals.
  size_t FirstBreak = LiteralText.find('\n');
  StringRef FirstLineText = FirstBreak == StringRef::npos
                                ? LiteralText
                                : LiteralText.substr(0, FirstBreak);
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
  size_t LastBreak = LiteralText.rfind('\n');
  if (LastBreak != StringRef::npos) {
    BacktickToken->IsMultiline = true;
    unsigned StartColumn = 0; // The template tail spans the entire line.
    BacktickToken->LastLineColumnWidth =
        encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
                                      StartColumn, Style.TabWidth, Encoding);
  }

  // Resume normal lexing right after the scanned literal.
  SourceLocation loc = Lex->getSourceLocation(Offset);
  resetLexer(SourceMgr.getFileOffset(loc));
}
1041
1042void FormatTokenLexer::tryParsePythonComment() {
1043 FormatToken *HashToken = Tokens.back();
1044 if (!HashToken->isOneOf(tok::hash, tok::hashhash))
1045 return;
1046 // Turn the remainder of this line into a comment.
1047 const char *CommentBegin =
1048 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
1049 size_t From = CommentBegin - Lex->getBuffer().begin();
1050 size_t To = Lex->getBuffer().find_first_of('\n', From);
1051 if (To == StringRef::npos)
1052 To = Lex->getBuffer().size();
1053 size_t Len = To - From;
1054 HashToken->setType(TT_LineComment);
1055 HashToken->Tok.setKind(tok::comment);
1056 HashToken->TokenText = Lex->getBuffer().substr(From, Len);
1057 SourceLocation Loc = To < Lex->getBuffer().size()
1058 ? Lex->getSourceLocation(CommentBegin + Len)
1059 : SourceMgr.getLocForEndOfFile(ID);
1060 resetLexer(SourceMgr.getFileOffset(Loc));
1061}
1062
1063bool FormatTokenLexer::tryMerge_TMacro() {
1064 if (Tokens.size() < 4)
1065 return false;
1066 FormatToken *Last = Tokens.back();
1067 if (Last->isNot(tok::r_paren))
1068 return false;
1069
1070 FormatToken *String = Tokens[Tokens.size() - 2];
1071 if (String->isNot(tok::string_literal) || String->IsMultiline)
1072 return false;
1073
1074 if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
1075 return false;
1076
1077 FormatToken *Macro = Tokens[Tokens.size() - 4];
1078 if (Macro->TokenText != "_T")
1079 return false;
1080
1081 const char *Start = Macro->TokenText.data();
1082 const char *End = Last->TokenText.data() + Last->TokenText.size();
1083 String->TokenText = StringRef(Start, End - Start);
1084 String->IsFirst = Macro->IsFirst;
1085 String->LastNewlineOffset = Macro->LastNewlineOffset;
1086 String->WhitespaceRange = Macro->WhitespaceRange;
1087 String->OriginalColumn = Macro->OriginalColumn;
1088 String->ColumnWidth = encoding::columnWidthWithTabs(
1089 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
1090 String->NewlinesBefore = Macro->NewlinesBefore;
1091 String->HasUnescapedNewline = Macro->HasUnescapedNewline;
1092
1093 Tokens.pop_back();
1094 Tokens.pop_back();
1095 Tokens.pop_back();
1096 Tokens.back() = String;
1097 if (FirstInLineIndex >= Tokens.size())
1098 FirstInLineIndex = Tokens.size() - 1;
1099 return true;
1100}
1101
1102bool FormatTokenLexer::tryMergeConflictMarkers() {
1103 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1104 return false;
1105
1106 // Conflict lines look like:
1107 // <marker> <text from the vcs>
1108 // For example:
1109 // >>>>>>> /file/in/file/system at revision 1234
1110 //
1111 // We merge all tokens in a line that starts with a conflict marker
1112 // into a single token with a special token type that the unwrapped line
1113 // parser will use to correctly rebuild the underlying code.
1114
1115 FileID ID;
1116 // Get the position of the first token in the line.
1117 unsigned FirstInLineOffset;
1118 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1119 Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1120 StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1121 // Calculate the offset of the start of the current line.
1122 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1123 if (LineOffset == StringRef::npos)
1124 LineOffset = 0;
1125 else
1126 ++LineOffset;
1127
1128 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1129 StringRef LineStart;
1130 if (FirstSpace == StringRef::npos)
1131 LineStart = Buffer.substr(LineOffset);
1132 else
1133 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1134
1135 TokenType Type = TT_Unknown;
1136 if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1137 Type = TT_ConflictStart;
1138 } else if (LineStart == "|||||||" || LineStart == "=======" ||
1139 LineStart == "====") {
1140 Type = TT_ConflictAlternative;
1141 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1142 Type = TT_ConflictEnd;
1143 }
1144
1145 if (Type != TT_Unknown) {
1146 FormatToken *Next = Tokens.back();
1147
1148 Tokens.resize(FirstInLineIndex + 1);
1149 // We do not need to build a complete token here, as we will skip it
1150 // during parsing anyway (as we must not touch whitespace around conflict
1151 // markers).
1152 Tokens.back()->setType(Type);
1153 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1154
1155 Tokens.push_back(Next);
1156 return true;
1157 }
1158
1159 return false;
1160}
1161
1162FormatToken *FormatTokenLexer::getStashedToken() {
1163 // Create a synthesized second '>' or '<' token.
1164 Token Tok = FormatTok->Tok;
1165 StringRef TokenText = FormatTok->TokenText;
1166
1167 unsigned OriginalColumn = FormatTok->OriginalColumn;
1168 FormatTok = new (Allocator.Allocate()) FormatToken;
1169 FormatTok->Tok = Tok;
1170 SourceLocation TokLocation =
1171 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1172 FormatTok->Tok.setLocation(TokLocation);
1173 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1174 FormatTok->TokenText = TokenText;
1175 FormatTok->ColumnWidth = 1;
1176 FormatTok->OriginalColumn = OriginalColumn + 1;
1177
1178 return FormatTok;
1179}
1180
1181/// Truncate the current token to the new length and make the lexer continue
1182/// from the end of the truncated token. Used for other languages that have
1183/// different token boundaries, like JavaScript in which a comment ends at a
1184/// line break regardless of whether the line break follows a backslash. Also
1185/// used to set the lexer to the end of whitespace if the lexer regards
1186/// whitespace and an unrecognized symbol as one token.
1187void FormatTokenLexer::truncateToken(size_t NewLen) {
1188 assert(NewLen <= FormatTok->TokenText.size());
1189 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1190 Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1191 FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1192 FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1193 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1194 Encoding);
1195 FormatTok->Tok.setLength(NewLen);
1196}
1197
/// Count the length of leading whitespace in a token. Whitespace here also
/// includes escaped newlines (a backslash, optional horizontal whitespace,
/// then a newline).
static size_t countLeadingWhitespace(StringRef Text) {
  // Basically counting the length matched by this regex.
  // "^([\n\r\f\v \t]|\\\\[\n\r])+"
  // Directly using the regex turned out to be slow. With the regex
  // version formatting all files in this directory took about 1.25
  // seconds. This version took about 0.5 seconds.
  const unsigned char *const Begin = Text.bytes_begin();
  const unsigned char *const End = Text.bytes_end();
  const unsigned char *Cur = Begin;
  while (Cur < End) {
    if (isWhitespace(Cur[0])) {
      ++Cur;
    } else if (Cur[0] == '\\') {
      // A backslash followed by optional horizontal whitespaces (P2223R1) and
      // then a newline always escapes the newline.
      // The source has a null byte at the end. So the end of the entire input
      // isn't reached yet. Also the lexer doesn't break apart an escaped
      // newline.
      const auto *Lookahead = Cur + 1;
      while (isHorizontalWhitespace(*Lookahead))
        ++Lookahead;
      // No line splice found; the backslash is a token.
      if (!isVerticalWhitespace(*Lookahead))
        break;
      // Splice found, consume it.
      Cur = Lookahead + 1;
    } else {
      break;
    }
  }
  return Cur - Begin;
}
1231
/// Lexes the next significant token, consuming and recording the whitespace
/// before it (newlines, column tracking, escaped newlines) and applying
/// language-specific adjustments (Verilog specials, ">>"/"<<" splitting,
/// comment truncation, macro classification).
FormatToken *FormatTokenLexer::getNextToken() {
  // A stashed state means the previous call split a ">>" or "<<" token; emit
  // the synthesized second half now instead of lexing new input.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  SourceLocation WhitespaceStart =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Consume and record whitespace until we find a significant token.
  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
  // followed by a symbol such as backtick. Those symbols may be
  // significant in other languages.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->isNot(tok::eof)) {
    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
    if (LeadingWhitespace == 0)
      break;
    if (LeadingWhitespace < FormatTok->TokenText.size())
      truncateToken(LeadingWhitespace);
    StringRef Text = FormatTok->TokenText;
    bool InEscape = false;
    for (int i = 0, e = Text.size(); i != e; ++i) {
      switch (Text[i]) {
      case '\r':
        // If this is a CRLF sequence, break here and the LF will be handled on
        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
        // the same as a single LF.
        if (i + 1 < e && Text[i + 1] == '\n')
          break;
        [[fallthrough]];
      case '\n':
        ++FormatTok->NewlinesBefore;
        if (!InEscape)
          FormatTok->HasUnescapedNewline = true;
        else
          InEscape = false;
        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
        Column = 0;
        break;
      case '\f':
        if (Style.KeepFormFeed && !FormatTok->HasFormFeedBefore &&
            // The form feed is immediately preceded and followed by a newline.
            i > 0 && Text[i - 1] == '\n' &&
            ((i + 1 < e && Text[i + 1] == '\n') ||
             (i + 2 < e && Text[i + 1] == '\r' && Text[i + 2] == '\n'))) {
          FormatTok->HasFormFeedBefore = true;
        }
        [[fallthrough]];
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t':
        // Advance to the next tab stop (guard against TabWidth == 0).
        Column +=
            Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
        break;
      case '\\':
        // The code preceding the loop and in the countLeadingWhitespace
        // function guarantees that Text is entirely whitespace, not including
        // comments but including escaped newlines. So if the character shows
        // up here, it has to be in an escape sequence.
        assert([&]() -> bool {
          size_t j = i + 1;
          while (j < Text.size() && isHorizontalWhitespace(Text[j]))
            ++j;
          return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r');
        }());
        InEscape = true;
        break;
      default:
        // This shouldn't happen.
        assert(false);
        break;
      }
    }
    WhitespaceLength += Text.size();
    readRawToken(*FormatTok);
  }

  if (FormatTok->is(tok::unknown))
    FormatTok->setType(TT_ImplicitStringLiteral);

  const bool IsCpp = Style.isCpp();

  // JavaScript and Java do not allow escaping the end of the line with a
  // backslash. Backslashes are syntax errors in plain source, but can occur in
  // comments. When a single line comment ends with a \, it'll cause the next
  // line of code to be lexed as a comment, breaking formatting. The code below
  // finds comments that contain a backslash followed by a line break, truncates
  // the comment token at the backslash, and resets the lexer to restart behind
  // the backslash.
  if (const auto Text = FormatTok->TokenText;
      Text.starts_with("//") &&
      (IsCpp || Style.isJavaScript() || Style.isJava())) {
    assert(FormatTok->is(tok::comment));
    for (auto Pos = Text.find('\\'); Pos++ != StringRef::npos;
         Pos = Text.find('\\', Pos)) {
      if (Pos < Text.size() && Text[Pos] == '\n' &&
          (!IsCpp || Text.substr(Pos + 1).ltrim().starts_with("//"))) {
        truncateToken(Pos);
        break;
      }
    }
  }

  if (Style.isVerilog()) {
    static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
    SmallVector<StringRef, 1> Matches;
    // Verilog uses the backtick instead of the hash for preprocessor stuff.
    // And it uses the hash for delays and parameter lists. In order to continue
    // using `tok::hash` in other places, the backtick gets marked as the hash
    // here. And in order to tell the backtick and hash apart for
    // Verilog-specific stuff, the hash becomes an identifier.
    if (FormatTok->is(tok::numeric_constant)) {
      // In Verilog the quote is not part of a number.
      auto Quote = FormatTok->TokenText.find('\'');
      if (Quote != StringRef::npos)
        truncateToken(Quote);
    } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
      FormatTok->Tok.setKind(tok::raw_identifier);
    } else if (FormatTok->is(tok::raw_identifier)) {
      if (FormatTok->TokenText == "`") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hash);
      } else if (FormatTok->TokenText == "``") {
        FormatTok->Tok.setIdentifierInfo(nullptr);
        FormatTok->Tok.setKind(tok::hashhash);
      } else if (!Tokens.empty() && Tokens.back()->is(Keywords.kw_apostrophe) &&
                 NumberBase.match(FormatTok->TokenText, &Matches)) {
        // In Verilog in a based number literal like `'b10`, there may be
        // whitespace between `'b` and `10`. Therefore we handle the base and
        // the rest of the number literal as two tokens. But if there is no
        // space in the input code, we need to manually separate the two parts.
        truncateToken(Matches[0].size());
        FormatTok->setFinalizedType(TT_VerilogNumberBase);
      }
    }
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));

  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->is(tok::comment)) {
    // FIXME: Add the trimmed whitespace to Column.
    StringRef UntrimmedText = FormatTok->TokenText;
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
  } else if (FormatTok->is(tok::raw_identifier)) {
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());
    // Demote C++ keywords that are plain identifiers in other languages.
    if (Style.isJava() &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isJavaScript() &&
               FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
                                  tok::kw_operator)) {
      FormatTok->Tok.setKind(tok::identifier);
    } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
      FormatTok->Tok.setKind(tok::identifier);
    }
  } else if (const bool Greater = FormatTok->is(tok::greatergreater);
             Greater || FormatTok->is(tok::lessless)) {
    // Split ">>"/"<<" into two tokens; the second half is stashed and
    // returned by the next getNextToken() call.
    FormatTok->Tok.setKind(Greater ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  } else if (Style.isJava() && FormatTok->is(tok::string_literal)) {
    tryParseJavaTextBlock();
  }

  if (Style.isVerilog() && !Tokens.empty() &&
      Tokens.back()->is(TT_VerilogNumberBase) &&
      FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
    // Mark the number following a base like `'h?a0` as a number.
    FormatTok->Tok.setKind(tok::numeric_constant);
  }

  // Now FormatTok is the next non-whitespace token.

  StringRef Text = FormatTok->TokenText;
  size_t FirstNewlinePos = Text.find('\n');
  if (FirstNewlinePos == StringRef::npos) {
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth =
        encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // FIXME: ColumnWidth actually depends on the start column, we need to
    // take this into account when the token is moved.
    FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);

    // The last line of the token always starts in column 0.
    // Thus, the length can be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (IsCpp) {
    // Classify configured macro identifiers (ForEachMacros, IfMacros, macro
    // block markers, etc.) — but not right after "#define".
    auto *Identifier = FormatTok->Tok.getIdentifierInfo();
    auto it = Macros.find(Identifier);
    if ((Tokens.empty() || !Tokens.back()->Tok.getIdentifierInfo() ||
         Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() !=
             tok::pp_define) &&
        it != Macros.end()) {
      FormatTok->setType(it->second);
      if (it->second == TT_IfMacro) {
        // The lexer token currently has type tok::kw_unknown. However, for this
        // substitution to be treated correctly in the TokenAnnotator, faking
        // the tok value seems to be needed. Not sure if there's a more elegant
        // way.
        FormatTok->Tok.setKind(tok::kw_if);
      }
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(Text))
        FormatTok->setType(TT_MacroBlockBegin);
      else if (MacroBlockEndRegex.match(Text))
        FormatTok->setType(TT_MacroBlockEnd);
      else if (MacrosSkippedByRemoveParentheses.contains(Identifier))
        FormatTok->setFinalizedType(TT_FunctionLikeMacro);
      else if (TemplateNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TemplateName);
      else if (TypeNames.contains(Identifier))
        FormatTok->setFinalizedType(TT_TypeName);
      else if (VariableTemplates.contains(Identifier))
        FormatTok->setFinalizedType(TT_VariableTemplate);
    }
  }

  return FormatTok;
}
1478
/// Tries to lex a Verilog-specific token at the current buffer position that
/// the C++ raw lexer would mishandle. Returns true and fills \p Tok when one
/// was lexed; returns false to fall back to the normal lexer.
bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
  const char *Start = Lex->getBufferLocation();
  size_t Len;
  switch (Start[0]) {
  // In Verilog the quote is not a character literal.
  case '\'':
    Len = 1;
    break;
  // Lex the backtick and double backtick as identifiers so they are easier to
  // match against.
  case '`':
    if (Start[1] == '`')
      Len = 2;
    else
      Len = 1;
    break;
  // In Verilog an escaped identifier starts with a backslash and ends with
  // whitespace. Unless that whitespace is an escaped newline.
  // FIXME: If there is an escaped newline in the middle of an escaped
  // identifier, allow for pasting the two lines together, But escaped
  // identifiers usually occur only in generated code anyway.
  case '\\':
    // A backslash can also begin an escaped newline outside of an escaped
    // identifier.
    if (Start[1] == '\r' || Start[1] == '\n')
      return false;
    Len = 1;
    while (Start[Len] != '\0' && Start[Len] != '\f' && Start[Len] != '\n' &&
           Start[Len] != '\r' && Start[Len] != '\t' && Start[Len] != '\v' &&
           Start[Len] != ' ') {
      // There is a null byte at the end of the buffer, so we don't have to
      // check whether the next byte is within the buffer.
      if (Start[Len] == '\\' && Start[Len + 1] == '\r' &&
          Start[Len + 2] == '\n') {
        // Escaped CRLF inside the identifier: consume the whole splice.
        Len += 3;
      } else if (Start[Len] == '\\' &&
                 (Start[Len + 1] == '\r' || Start[Len + 1] == '\n')) {
        // Escaped single CR or LF.
        Len += 2;
      } else {
        Len += 1;
      }
    }
    break;
  default:
    return false;
  }

  // The kind has to be an identifier so we can match it against those defined
  // in Keywords. The kind has to be set before the length because the setLength
  // function checks that the kind is not an annotation.
  Tok.setKind(tok::raw_identifier);
  Tok.setLength(Len);
  Tok.setLocation(Lex->getSourceLocation(Start, Len));
  Tok.setRawIdentifierData(Start);
  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
  return true;
}
1536
1537void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1538 // For Verilog, first see if there is a special token, and fall back to the
1539 // normal lexer if there isn't one.
1540 if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1541 Lex->LexFromRawLexer(Tok.Tok);
1542 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1543 Tok.Tok.getLength());
1544 // For formatting, treat unterminated string literals like normal string
1545 // literals.
1546 if (Tok.is(tok::unknown)) {
1547 if (Tok.TokenText.starts_with("\"")) {
1548 Tok.Tok.setKind(tok::string_literal);
1549 Tok.IsUnterminatedLiteral = true;
1550 } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1551 Tok.Tok.setKind(tok::string_literal);
1552 }
1553 }
1554
1555 if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1556 Tok.Tok.setKind(tok::string_literal);
1557
1558 if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1559 FormattingDisabled = false;
1560
1561 Tok.Finalized = FormattingDisabled;
1562
1563 if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1564 FormattingDisabled = true;
1565}
1566
1567void FormatTokenLexer::resetLexer(unsigned Offset) {
1568 StringRef Buffer = SourceMgr.getBufferData(ID);
1569 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1570 Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1571 Lex->SetKeepWhitespaceMode(true);
1572 TrailingWhitespace = 0;
1573}
1574
1575} // namespace format
1576} // namespace clang
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
FormatToken()
Token Tok
The Token.
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
FormatToken * Next
The next token in the unwrapped line.
bool is(tok::TokenKind Kind) const
Various functions to configurably format source code.
#define X(type, name)
Definition Value.h:97
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition Lexer.h:78
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
Token - This structure provides full information about a lexed token.
Definition Token.h:36
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition Token.h:134
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
uint32_t Literal
Literals are represented as positive integers.
Definition CNFFormula.h:35
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition Format.cpp:4465
bool isClangFormatOn(StringRef Comment)
Definition Format.cpp:4461
TokenType
Determines the semantic type of a syntactic token, e.g.
LangOptions getFormattingLangOpts(const FormatStyle &Style)
Definition Format.cpp:4106
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition CharInfo.h:99
std::vector< std::string > Macros
A list of macros of the form <definition>=<expansion> .
Definition Format.h:3489
@ TemplateName
The identifier is a template name. FIXME: Add an annotation for that.
Definition Parser.h:61
nullptr
This class represents a compute construct, representing a 'Kind' of 'parallel', 'serial',...
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition Format.h:5308
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition CharInfo.h:91
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition CharInfo.h:108
@ Keyword
The name has been typo-corrected to a keyword.
Definition Sema.h:560
@ Type
The name was classified as a type.
Definition Sema.h:562
std::vector< std::string > MacrosSkippedByRemoveParentheses
A vector of function-like macros whose invocations should be skipped by RemoveParentheses.
Definition Format.h:3494
std::vector< std::string > TemplateNames
A vector of non-keyword identifiers that should be interpreted as template names.
Definition Format.h:5298
std::vector< std::string > VariableTemplates
A vector of non-keyword identifiers that should be interpreted as variable template names.
Definition Format.h:5359
#define true
Definition stdbool.h:25
A wrapper around a Token storing information about the whitespace characters preceding it.
bool isNot(T Kind) const
StringRef TokenText
The raw text of the token.
unsigned LastNewlineOffset
The offset just past the last ' ' in this token's leading whitespace (relative to WhiteSpaceStart).
unsigned NewlinesBefore
The number of newlines immediately before the Token.
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
bool HasFormFeedBefore
Has "\n\f\n" or "\n\f\r\n" before TokenText.
unsigned IsFirst
Indicates that this is the first token of the file.