//===- ScriptLexer.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a lexer for the linker script.
//
// The linker script's grammar is not complex but ambiguous due to the
// lack of a formal specification of the language. What we are trying to
// do in this and other files in LLD is to make a "reasonable" linker
// script processor.
//
// Among simplicity, compatibility and efficiency, we put the most
// emphasis on simplicity when we wrote this lexer. Compatibility with the
// GNU linkers is important, but we did not try to clone every tiny corner
// case of their lexers, as even ld.bfd and ld.gold are subtly different
// in various corner cases. We do not care much about efficiency because
// the time spent in parsing linker scripts is usually negligible.
//
// Overall, this lexer works fine for most linker scripts. There might
// be room for improving compatibility, but that's probably not at the
// top of our todo list.
//
//===----------------------------------------------------------------------===//
28d2d3ebb8SDimitry Andric
29d2d3ebb8SDimitry Andric #include "ScriptLexer.h"
30eb1ff93dSDimitry Andric #include "lld/Common/ErrorHandler.h"
31d2d3ebb8SDimitry Andric #include "llvm/ADT/Twine.h"
32145449b1SDimitry Andric #include "llvm/Support/ErrorHandling.h"
33145449b1SDimitry Andric #include <algorithm>
34d2d3ebb8SDimitry Andric
35d2d3ebb8SDimitry Andric using namespace llvm;
36cfca06d7SDimitry Andric using namespace lld;
37cfca06d7SDimitry Andric using namespace lld::elf;
38d2d3ebb8SDimitry Andric
39d2d3ebb8SDimitry Andric // Returns a whole line containing the current token.
getLine()40d2d3ebb8SDimitry Andric StringRef ScriptLexer::getLine() {
41f1e1c239SDimitry Andric StringRef s = getCurrentMB().getBuffer();
42f1e1c239SDimitry Andric StringRef tok = tokens[pos - 1];
43d2d3ebb8SDimitry Andric
44f1e1c239SDimitry Andric size_t pos = s.rfind('\n', tok.data() - s.data());
45f1e1c239SDimitry Andric if (pos != StringRef::npos)
46f1e1c239SDimitry Andric s = s.substr(pos + 1);
47f1e1c239SDimitry Andric return s.substr(0, s.find_first_of("\r\n"));
48d2d3ebb8SDimitry Andric }
49d2d3ebb8SDimitry Andric
50d2d3ebb8SDimitry Andric // Returns 1-based line number of the current token.
getLineNumber()51d2d3ebb8SDimitry Andric size_t ScriptLexer::getLineNumber() {
52cfca06d7SDimitry Andric if (pos == 0)
53cfca06d7SDimitry Andric return 1;
54f1e1c239SDimitry Andric StringRef s = getCurrentMB().getBuffer();
55f1e1c239SDimitry Andric StringRef tok = tokens[pos - 1];
56344a3780SDimitry Andric const size_t tokOffset = tok.data() - s.data();
57344a3780SDimitry Andric
58344a3780SDimitry Andric // For the first token, or when going backwards, start from the beginning of
59344a3780SDimitry Andric // the buffer. If this token is after the previous token, start from the
60344a3780SDimitry Andric // previous token.
61344a3780SDimitry Andric size_t line = 1;
62344a3780SDimitry Andric size_t start = 0;
63344a3780SDimitry Andric if (lastLineNumberOffset > 0 && tokOffset >= lastLineNumberOffset) {
64344a3780SDimitry Andric start = lastLineNumberOffset;
65344a3780SDimitry Andric line = lastLineNumber;
66344a3780SDimitry Andric }
67344a3780SDimitry Andric
68344a3780SDimitry Andric line += s.substr(start, tokOffset - start).count('\n');
69344a3780SDimitry Andric
70344a3780SDimitry Andric // Store the line number of this token for reuse.
71344a3780SDimitry Andric lastLineNumberOffset = tokOffset;
72344a3780SDimitry Andric lastLineNumber = line;
73344a3780SDimitry Andric
74344a3780SDimitry Andric return line;
75d2d3ebb8SDimitry Andric }
76d2d3ebb8SDimitry Andric
77d2d3ebb8SDimitry Andric // Returns 0-based column number of the current token.
getColumnNumber()78d2d3ebb8SDimitry Andric size_t ScriptLexer::getColumnNumber() {
79f1e1c239SDimitry Andric StringRef tok = tokens[pos - 1];
80f1e1c239SDimitry Andric return tok.data() - getLine().data();
81d2d3ebb8SDimitry Andric }
82d2d3ebb8SDimitry Andric
getCurrentLocation()83d2d3ebb8SDimitry Andric std::string ScriptLexer::getCurrentLocation() {
84cfca06d7SDimitry Andric std::string filename = std::string(getCurrentMB().getBufferIdentifier());
85f1e1c239SDimitry Andric return (filename + ":" + Twine(getLineNumber())).str();
86d2d3ebb8SDimitry Andric }
87d2d3ebb8SDimitry Andric
ScriptLexer(MemoryBufferRef mb)88f1e1c239SDimitry Andric ScriptLexer::ScriptLexer(MemoryBufferRef mb) { tokenize(mb); }
89d2d3ebb8SDimitry Andric
90d2d3ebb8SDimitry Andric // We don't want to record cascading errors. Keep only the first one.
setError(const Twine & msg)91f1e1c239SDimitry Andric void ScriptLexer::setError(const Twine &msg) {
92eb1ff93dSDimitry Andric if (errorCount())
93d2d3ebb8SDimitry Andric return;
94d2d3ebb8SDimitry Andric
95f1e1c239SDimitry Andric std::string s = (getCurrentLocation() + ": " + msg).str();
96f1e1c239SDimitry Andric if (pos)
97f1e1c239SDimitry Andric s += "\n>>> " + getLine().str() + "\n>>> " +
98eb1ff93dSDimitry Andric std::string(getColumnNumber(), ' ') + "^";
99f1e1c239SDimitry Andric error(s);
100d2d3ebb8SDimitry Andric }
101d2d3ebb8SDimitry Andric
102d2d3ebb8SDimitry Andric // Split S into linker script tokens.
tokenize(MemoryBufferRef mb)103f1e1c239SDimitry Andric void ScriptLexer::tokenize(MemoryBufferRef mb) {
104f1e1c239SDimitry Andric std::vector<StringRef> vec;
105f1e1c239SDimitry Andric mbs.push_back(mb);
106f1e1c239SDimitry Andric StringRef s = mb.getBuffer();
107f1e1c239SDimitry Andric StringRef begin = s;
108d2d3ebb8SDimitry Andric
109d2d3ebb8SDimitry Andric for (;;) {
110f1e1c239SDimitry Andric s = skipSpace(s);
111f1e1c239SDimitry Andric if (s.empty())
112d2d3ebb8SDimitry Andric break;
113d2d3ebb8SDimitry Andric
114d2d3ebb8SDimitry Andric // Quoted token. Note that double-quote characters are parts of a token
115d2d3ebb8SDimitry Andric // because, in a glob match context, only unquoted tokens are interpreted
116d2d3ebb8SDimitry Andric // as glob patterns. Double-quoted tokens are literal patterns in that
117d2d3ebb8SDimitry Andric // context.
1187fa27ce4SDimitry Andric if (s.starts_with("\"")) {
119f1e1c239SDimitry Andric size_t e = s.find("\"", 1);
120f1e1c239SDimitry Andric if (e == StringRef::npos) {
121f1e1c239SDimitry Andric StringRef filename = mb.getBufferIdentifier();
122f1e1c239SDimitry Andric size_t lineno = begin.substr(0, s.data() - begin.data()).count('\n');
123f1e1c239SDimitry Andric error(filename + ":" + Twine(lineno + 1) + ": unclosed quote");
124d2d3ebb8SDimitry Andric return;
125d2d3ebb8SDimitry Andric }
126d2d3ebb8SDimitry Andric
127f1e1c239SDimitry Andric vec.push_back(s.take_front(e + 1));
128f1e1c239SDimitry Andric s = s.substr(e + 1);
129d2d3ebb8SDimitry Andric continue;
130d2d3ebb8SDimitry Andric }
131d2d3ebb8SDimitry Andric
132145449b1SDimitry Andric // Some operators form separate tokens.
1337fa27ce4SDimitry Andric if (s.starts_with("<<=") || s.starts_with(">>=")) {
134145449b1SDimitry Andric vec.push_back(s.substr(0, 3));
135145449b1SDimitry Andric s = s.substr(3);
136145449b1SDimitry Andric continue;
137145449b1SDimitry Andric }
1387fa27ce4SDimitry Andric if (s.size() > 1 && ((s[1] == '=' && strchr("*/+-<>&^|", s[0])) ||
139145449b1SDimitry Andric (s[0] == s[1] && strchr("<>&|", s[0])))) {
140f1e1c239SDimitry Andric vec.push_back(s.substr(0, 2));
141f1e1c239SDimitry Andric s = s.substr(2);
142ae1a339dSDimitry Andric continue;
143ae1a339dSDimitry Andric }
144ae1a339dSDimitry Andric
145d2d3ebb8SDimitry Andric // Unquoted token. This is more relaxed than tokens in C-like language,
146d2d3ebb8SDimitry Andric // so that you can write "file-name.cpp" as one bare token, for example.
147f1e1c239SDimitry Andric size_t pos = s.find_first_not_of(
148d2d3ebb8SDimitry Andric "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
149ae1a339dSDimitry Andric "0123456789_.$/\\~=+[]*?-!^:");
150d2d3ebb8SDimitry Andric
151d2d3ebb8SDimitry Andric // A character that cannot start a word (which is usually a
152d2d3ebb8SDimitry Andric // punctuation) forms a single character token.
153f1e1c239SDimitry Andric if (pos == 0)
154f1e1c239SDimitry Andric pos = 1;
155f1e1c239SDimitry Andric vec.push_back(s.substr(0, pos));
156f1e1c239SDimitry Andric s = s.substr(pos);
157d2d3ebb8SDimitry Andric }
158d2d3ebb8SDimitry Andric
159f1e1c239SDimitry Andric tokens.insert(tokens.begin() + pos, vec.begin(), vec.end());
160d2d3ebb8SDimitry Andric }
161d2d3ebb8SDimitry Andric
162d2d3ebb8SDimitry Andric // Skip leading whitespace characters or comments.
skipSpace(StringRef s)163f1e1c239SDimitry Andric StringRef ScriptLexer::skipSpace(StringRef s) {
164d2d3ebb8SDimitry Andric for (;;) {
1657fa27ce4SDimitry Andric if (s.starts_with("/*")) {
166f1e1c239SDimitry Andric size_t e = s.find("*/", 2);
167f1e1c239SDimitry Andric if (e == StringRef::npos) {
168b60736ecSDimitry Andric setError("unclosed comment in a linker script");
169d2d3ebb8SDimitry Andric return "";
170d2d3ebb8SDimitry Andric }
171f1e1c239SDimitry Andric s = s.substr(e + 2);
172d2d3ebb8SDimitry Andric continue;
173d2d3ebb8SDimitry Andric }
1747fa27ce4SDimitry Andric if (s.starts_with("#")) {
175f1e1c239SDimitry Andric size_t e = s.find('\n', 1);
176f1e1c239SDimitry Andric if (e == StringRef::npos)
177f1e1c239SDimitry Andric e = s.size() - 1;
178f1e1c239SDimitry Andric s = s.substr(e + 1);
179d2d3ebb8SDimitry Andric continue;
180d2d3ebb8SDimitry Andric }
181f1e1c239SDimitry Andric size_t size = s.size();
182f1e1c239SDimitry Andric s = s.ltrim();
183f1e1c239SDimitry Andric if (s.size() == size)
184f1e1c239SDimitry Andric return s;
185d2d3ebb8SDimitry Andric }
186d2d3ebb8SDimitry Andric }
187d2d3ebb8SDimitry Andric
188d2d3ebb8SDimitry Andric // An erroneous token is handled as if it were the last token before EOF.
atEOF()189f1e1c239SDimitry Andric bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
190d2d3ebb8SDimitry Andric
191d2d3ebb8SDimitry Andric // Split a given string as an expression.
192d2d3ebb8SDimitry Andric // This function returns "3", "*" and "5" for "3*5" for example.
tokenizeExpr(StringRef s)193f1e1c239SDimitry Andric static std::vector<StringRef> tokenizeExpr(StringRef s) {
1947fa27ce4SDimitry Andric StringRef ops = "!~*/+-<>?^:="; // List of operators
195d2d3ebb8SDimitry Andric
196d2d3ebb8SDimitry Andric // Quoted strings are literal strings, so we don't want to split it.
1977fa27ce4SDimitry Andric if (s.starts_with("\""))
198f1e1c239SDimitry Andric return {s};
199d2d3ebb8SDimitry Andric
200eb1ff93dSDimitry Andric // Split S with operators as separators.
201f1e1c239SDimitry Andric std::vector<StringRef> ret;
202f1e1c239SDimitry Andric while (!s.empty()) {
203f1e1c239SDimitry Andric size_t e = s.find_first_of(ops);
204d2d3ebb8SDimitry Andric
205d2d3ebb8SDimitry Andric // No need to split if there is no operator.
206f1e1c239SDimitry Andric if (e == StringRef::npos) {
207f1e1c239SDimitry Andric ret.push_back(s);
208d2d3ebb8SDimitry Andric break;
209d2d3ebb8SDimitry Andric }
210d2d3ebb8SDimitry Andric
211cfca06d7SDimitry Andric // Get a token before the operator.
212f1e1c239SDimitry Andric if (e != 0)
213f1e1c239SDimitry Andric ret.push_back(s.substr(0, e));
214d2d3ebb8SDimitry Andric
215f1e1c239SDimitry Andric // Get the operator as a token.
216f1e1c239SDimitry Andric // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
2177fa27ce4SDimitry Andric if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
2187fa27ce4SDimitry Andric s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
2197fa27ce4SDimitry Andric s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
220f1e1c239SDimitry Andric ret.push_back(s.substr(e, 2));
221f1e1c239SDimitry Andric s = s.substr(e + 2);
222eb1ff93dSDimitry Andric } else {
223f1e1c239SDimitry Andric ret.push_back(s.substr(e, 1));
224f1e1c239SDimitry Andric s = s.substr(e + 1);
225d2d3ebb8SDimitry Andric }
226eb1ff93dSDimitry Andric }
227f1e1c239SDimitry Andric return ret;
228d2d3ebb8SDimitry Andric }
229d2d3ebb8SDimitry Andric
230d2d3ebb8SDimitry Andric // In contexts where expressions are expected, the lexer should apply
231d2d3ebb8SDimitry Andric // different tokenization rules than the default one. By default,
232d2d3ebb8SDimitry Andric // arithmetic operator characters are regular characters, but in the
233d2d3ebb8SDimitry Andric // expression context, they should be independent tokens.
234d2d3ebb8SDimitry Andric //
235d2d3ebb8SDimitry Andric // For example, "foo*3" should be tokenized to "foo", "*" and "3" only
236d2d3ebb8SDimitry Andric // in the expression context.
237d2d3ebb8SDimitry Andric //
238d2d3ebb8SDimitry Andric // This function may split the current token into multiple tokens.
maybeSplitExpr()239d2d3ebb8SDimitry Andric void ScriptLexer::maybeSplitExpr() {
240f1e1c239SDimitry Andric if (!inExpr || errorCount() || atEOF())
241d2d3ebb8SDimitry Andric return;
242d2d3ebb8SDimitry Andric
243f1e1c239SDimitry Andric std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
244f1e1c239SDimitry Andric if (v.size() == 1)
245d2d3ebb8SDimitry Andric return;
246f1e1c239SDimitry Andric tokens.erase(tokens.begin() + pos);
247f1e1c239SDimitry Andric tokens.insert(tokens.begin() + pos, v.begin(), v.end());
248d2d3ebb8SDimitry Andric }
249d2d3ebb8SDimitry Andric
next()250d2d3ebb8SDimitry Andric StringRef ScriptLexer::next() {
251d2d3ebb8SDimitry Andric maybeSplitExpr();
252d2d3ebb8SDimitry Andric
253eb1ff93dSDimitry Andric if (errorCount())
254d2d3ebb8SDimitry Andric return "";
255d2d3ebb8SDimitry Andric if (atEOF()) {
256d2d3ebb8SDimitry Andric setError("unexpected EOF");
257d2d3ebb8SDimitry Andric return "";
258d2d3ebb8SDimitry Andric }
259f1e1c239SDimitry Andric return tokens[pos++];
260d2d3ebb8SDimitry Andric }
261d2d3ebb8SDimitry Andric
peek()262d2d3ebb8SDimitry Andric StringRef ScriptLexer::peek() {
263f1e1c239SDimitry Andric StringRef tok = next();
264eb1ff93dSDimitry Andric if (errorCount())
265d2d3ebb8SDimitry Andric return "";
266f1e1c239SDimitry Andric pos = pos - 1;
267f1e1c239SDimitry Andric return tok;
268d2d3ebb8SDimitry Andric }
269d2d3ebb8SDimitry Andric
consume(StringRef tok)270f1e1c239SDimitry Andric bool ScriptLexer::consume(StringRef tok) {
271ac9a064cSDimitry Andric if (next() == tok)
272d2d3ebb8SDimitry Andric return true;
273ac9a064cSDimitry Andric --pos;
274d2d3ebb8SDimitry Andric return false;
275d2d3ebb8SDimitry Andric }
276d2d3ebb8SDimitry Andric
277d2d3ebb8SDimitry Andric // Consumes Tok followed by ":". Space is allowed between Tok and ":".
consumeLabel(StringRef tok)278f1e1c239SDimitry Andric bool ScriptLexer::consumeLabel(StringRef tok) {
279f1e1c239SDimitry Andric if (consume((tok + ":").str()))
280d2d3ebb8SDimitry Andric return true;
281f1e1c239SDimitry Andric if (tokens.size() >= pos + 2 && tokens[pos] == tok &&
282f1e1c239SDimitry Andric tokens[pos + 1] == ":") {
283f1e1c239SDimitry Andric pos += 2;
284d2d3ebb8SDimitry Andric return true;
285d2d3ebb8SDimitry Andric }
286d2d3ebb8SDimitry Andric return false;
287d2d3ebb8SDimitry Andric }
288d2d3ebb8SDimitry Andric
skip()289d2d3ebb8SDimitry Andric void ScriptLexer::skip() { (void)next(); }
290d2d3ebb8SDimitry Andric
expect(StringRef expect)291f1e1c239SDimitry Andric void ScriptLexer::expect(StringRef expect) {
292eb1ff93dSDimitry Andric if (errorCount())
293d2d3ebb8SDimitry Andric return;
294f1e1c239SDimitry Andric StringRef tok = next();
295f1e1c239SDimitry Andric if (tok != expect)
296f1e1c239SDimitry Andric setError(expect + " expected, but got " + tok);
297d2d3ebb8SDimitry Andric }
298d2d3ebb8SDimitry Andric
299d2d3ebb8SDimitry Andric // Returns true if S encloses T.
encloses(StringRef s,StringRef t)300f1e1c239SDimitry Andric static bool encloses(StringRef s, StringRef t) {
301f1e1c239SDimitry Andric return s.bytes_begin() <= t.bytes_begin() && t.bytes_end() <= s.bytes_end();
302d2d3ebb8SDimitry Andric }
303d2d3ebb8SDimitry Andric
getCurrentMB()304d2d3ebb8SDimitry Andric MemoryBufferRef ScriptLexer::getCurrentMB() {
305d2d3ebb8SDimitry Andric // Find input buffer containing the current token.
306cfca06d7SDimitry Andric assert(!mbs.empty());
307cfca06d7SDimitry Andric if (pos == 0)
308cfca06d7SDimitry Andric return mbs.back();
309f1e1c239SDimitry Andric for (MemoryBufferRef mb : mbs)
310f1e1c239SDimitry Andric if (encloses(mb.getBuffer(), tokens[pos - 1]))
311f1e1c239SDimitry Andric return mb;
312d2d3ebb8SDimitry Andric llvm_unreachable("getCurrentMB: failed to find a token");
313d2d3ebb8SDimitry Andric }
314