aboutsummaryrefslogtreecommitdiff
path: root/compiler/src/lex
diff options
context:
space:
mode:
authorTolmachev Igor <me@igorek.dev>2026-05-09 20:47:04 +0300
committerTolmachev Igor <me@igorek.dev>2026-05-09 20:47:04 +0300
commit160b64427d79290a59ac48c9babca064232d8dfd (patch)
tree0c2cc79f0a266761866ff325abdd4f2f0c7e7301 /compiler/src/lex
parent6be28381d6081dfb3a1dc9d1ec15062b67ba1ef9 (diff)
downloadcrisp-160b64427d79290a59ac48c9babca064232d8dfd.tar.gz
crisp-160b64427d79290a59ac48c9babca064232d8dfd.zip
Make project structure more consistentdev
Diffstat (limited to 'compiler/src/lex')
-rw-r--r--compiler/src/lex/lexer.rs161
-rw-r--r--compiler/src/lex/mod.rs8
-rw-r--r--compiler/src/lex/tests.rs383
-rw-r--r--compiler/src/lex/token.rs10
4 files changed, 562 insertions, 0 deletions
diff --git a/compiler/src/lex/lexer.rs b/compiler/src/lex/lexer.rs
new file mode 100644
index 0000000..801d382
--- /dev/null
+++ b/compiler/src/lex/lexer.rs
@@ -0,0 +1,161 @@
1use crate::{
2 lex::Token,
3 span::{Pos, Span, Spanned},
4};
5
/// Reports whether `ch` ends an atom (a number or a symbol).
///
/// Terminators are any Unicode whitespace character plus the punctuation
/// `(`, `)`, `'`, `"` and `;`.
fn is_terminator(ch: char) -> bool {
    match ch {
        '(' | ')' | '\'' | '"' | ';' => true,
        other => other.is_whitespace(),
    }
}
9
10pub struct Lexer<'a> {
11 input: &'a str,
12 cursor: usize,
13
14 line: usize,
15 column: usize,
16}
17
18impl<'a> Lexer<'a> {
19 pub fn new(input: &'a str) -> Self {
20 Self {
21 input,
22 cursor: 0,
23
24 line: 1,
25 column: 0,
26 }
27 }
28
29 fn rest(&self) -> &str {
30 &self.input[self.cursor..]
31 }
32
33 fn peek(&self) -> Option<char> {
34 self.rest().chars().next()
35 }
36
37 fn peek_nth(&self, n: usize) -> Option<char> {
38 self.rest().chars().nth(n)
39 }
40
41 fn consume(&mut self) -> Option<char> {
42 let ch = self.peek()?;
43
44 self.cursor += ch.len_utf8();
45 if ch == '\n' {
46 self.line += 1;
47 self.column = 0;
48 } else {
49 self.column += 1;
50 }
51
52 Some(ch)
53 }
54
55 fn next_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> &'a str {
56 let start = self.cursor;
57
58 while let Some(ch) = self.peek() {
59 if !predicate(ch) {
60 break;
61 }
62 self.consume();
63 }
64
65 &self.input[start..self.cursor]
66 }
67
68 fn next_atom(&mut self) -> &'a str {
69 self.next_while(|ch| !is_terminator(ch))
70 }
71
72 fn next_string(&mut self) -> Result<&'a str, &'a str> {
73 debug_assert_eq!(self.peek(), Some('"'));
74 self.consume();
75
76 let start = self.cursor;
77
78 while let Some(ch) = self.peek() {
79 match ch {
80 '"' => {
81 let string = &self.input[start..self.cursor];
82 self.consume();
83 return Ok(string);
84 }
85 '\n' => {
86 let string = &self.input[start..self.cursor];
87 self.consume();
88 return Err(string);
89 }
90 '\\' => {
91 self.consume();
92 self.consume();
93 }
94 _ => {
95 self.consume();
96 }
97 }
98 }
99
100 Err(&self.input[start..self.cursor])
101 }
102}
103
104impl<'a> Iterator for Lexer<'a> {
105 type Item = Spanned<Token<'a>>;
106
107 fn next(&mut self) -> Option<Self::Item> {
108 loop {
109 match self.peek()? {
110 ch if ch.is_whitespace() => {
111 self.next_while(char::is_whitespace);
112 }
113 ';' => {
114 self.next_while(|ch| ch != '\n');
115 }
116 _ => break,
117 }
118 }
119
120 let start = Pos::new(self.line, self.column, self.cursor);
121
122 let token = match self.peek()? {
123 '(' => {
124 self.consume();
125 Token::LeftPar
126 }
127 ')' => {
128 self.consume();
129 Token::RightPar
130 }
131 '\'' => {
132 self.consume();
133 Token::Quote
134 }
135
136 // Number
137 ch if ch.is_ascii_digit()
138 || ch == '.' && self.peek_nth(1).is_some_and(|ch| ch.is_ascii_digit())
139 || matches!(ch, '+' | '-')
140 && self.peek_nth(1).is_some_and(|ch| ch.is_ascii_digit())
141 || matches!(ch, '+' | '-')
142 && self.peek_nth(1).is_some_and(|ch| ch == '.')
143 && self.peek_nth(2).is_some_and(|ch| ch.is_ascii_digit()) =>
144 {
145 Token::Number(self.next_atom())
146 }
147
148 // String
149 '"' => match self.next_string() {
150 Ok(string) => Token::String(string),
151 Err(string) => Token::UnclosedString(string),
152 },
153
154 // Symbol
155 _ => Token::Symbol(self.next_atom()),
156 };
157
158 let end = Pos::new(self.line, self.column, self.cursor);
159 Some(Spanned::new(token, Span::new(start, end)))
160 }
161}
diff --git a/compiler/src/lex/mod.rs b/compiler/src/lex/mod.rs
new file mode 100644
index 0000000..7bc4440
--- /dev/null
+++ b/compiler/src/lex/mod.rs
@@ -0,0 +1,8 @@
1mod lexer;
2mod token;
3
4pub use lexer::Lexer;
5pub use token::Token;
6
7#[cfg(test)]
8mod tests;
diff --git a/compiler/src/lex/tests.rs b/compiler/src/lex/tests.rs
new file mode 100644
index 0000000..2d872a2
--- /dev/null
+++ b/compiler/src/lex/tests.rs
@@ -0,0 +1,383 @@
1use crate::{
2 lex::{Lexer, Token, Token::*},
3 span::Pos,
4};
5
6fn tokenize<'a>(input: &'a str) -> Vec<Token<'a>> {
7 Lexer::new(input).map(|s| s.inner).collect()
8}
9
#[test]
fn test_spaces() {
    // Empty and whitespace-only input must not produce any token.
    let table = [
        ("", vec![]),
        (" ", vec![]),
        ("\n", vec![]),
        ("\t\n \r\n", vec![]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
22
#[test]
fn test_parens() {
    // Each parenthesis is its own token, whether or not the input is balanced.
    let table = [
        ("()", vec![LeftPar, RightPar]),
        ("( )", vec![LeftPar, RightPar]),
        ("(())", vec![LeftPar, LeftPar, RightPar, RightPar]),
        (
            "((()))",
            vec![LeftPar, LeftPar, LeftPar, RightPar, RightPar, RightPar],
        ),
        (")(", vec![RightPar, LeftPar]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
39
#[test]
fn test_quote() {
    // `'` is always a standalone token, regardless of what follows it.
    let table = [
        ("'", vec![Quote]),
        ("'a", vec![Quote, Symbol("a")]),
        ("''a", vec![Quote, Quote, Symbol("a")]),
        ("'()", vec![Quote, LeftPar, RightPar]),
        (
            "'(1 2)",
            vec![Quote, LeftPar, Number("1"), Number("2"), RightPar],
        ),
        ("(' )", vec![LeftPar, Quote, RightPar]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
57
#[test]
fn test_numbers() {
    // Every input here is one atom that opens like a number, so the whole
    // atom comes back verbatim as a single `Number`.
    let literals = [
        "0", "42", "3.14", "-7", "+5", "-0.5", "1e10", "1.5e-3", ".5", "-.5", "+.5", "-.0",
    ];
    for literal in literals {
        assert_eq!(tokenize(literal), vec![Number(literal)]);
    }
}
78
#[test]
fn test_strings() {
    // Inside a string literal, parentheses, quotes and `;` are plain text.
    let table = [
        (r#""""#, vec![String("")]),
        (r#""hello""#, vec![String("hello")]),
        (r#""hello world""#, vec![String("hello world")]),
        (r#""(not a list)""#, vec![String("(not a list)")]),
        (r#""'not a quote""#, vec![String("'not a quote")]),
        (r#""; not a comment""#, vec![String("; not a comment")]),
        (r#"" spaces ""#, vec![String(" spaces ")]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
94
#[test]
fn test_string_escapes() {
    // Escape sequences are kept as written; a backslash only stops the
    // following character (a quote or a newline) from ending the literal.
    let table = [
        (r#""line\nbreak""#, vec![String(r"line\nbreak")]),
        (r#""with \"quotes\"""#, vec![String(r#"with \"quotes\""#)]),
        (r#""\\""#, vec![String(r"\\")]),
        ("\"single\\\nline\"", vec![String("single\\\nline")]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
107
#[test]
fn test_unclosed_strings() {
    // A literal cut short by end of input or a raw newline is reported as
    // `UnclosedString`, and lexing carries on after the newline.
    let table = [
        (r#""abc"#, vec![UnclosedString("abc")]),
        (r#""abc\""#, vec![UnclosedString(r#"abc\""#)]),
        ("\"abc\n", vec![UnclosedString("abc")]),
        ("\"abc\\\ndef", vec![UnclosedString("abc\\\ndef")]),
        ("\"abc\n\"def\"", vec![UnclosedString("abc"), String("def")]),
        (r#"""#, vec![UnclosedString("")]),
        ("\"\n\"", vec![UnclosedString(""), UnclosedString("")]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
123
#[test]
fn test_symbols() {
    // Each input is one atom that does not open like a number, so it comes
    // back verbatim as a single `Symbol`.
    let names = [
        "foo", "foo-bar", "foo!", "empty?", "set!", "->", "+", "-", "*", "/", "=", "<=", ">=",
        "a1b2", "x",
    ];
    for name in names {
        assert_eq!(tokenize(name), vec![Symbol(name)]);
    }
}
147
#[test]
fn test_ambiguous() {
    // A sign or a dot alone does not make a number: without a digit right
    // after the sign/dot prefix the atom is a symbol.
    let names = ["-x", "+foo", "...", ".foo", "-.", "+.", ".", "+.a", "-.a"];
    for name in names {
        assert_eq!(tokenize(name), vec![Symbol(name)]);
    }
}
165
#[test]
fn test_no_separators() {
    // Parentheses, quotes and string delimiters split tokens without any
    // whitespace in between.
    let table = [
        ("(foo)", vec![LeftPar, Symbol("foo"), RightPar]),
        ("(1)", vec![LeftPar, Number("1"), RightPar]),
        ("(a)b", vec![LeftPar, Symbol("a"), RightPar, Symbol("b")]),
        ("'(a)", vec![Quote, LeftPar, Symbol("a"), RightPar]),
        (r#"("s")"#, vec![LeftPar, String("s"), RightPar]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
179
#[test]
fn test_whitespace_separators() {
    // Newlines and tabs separate tokens just like spaces do.
    let table = [
        (
            "(\n foo\n bar\n)",
            vec![LeftPar, Symbol("foo"), Symbol("bar"), RightPar],
        ),
        (
            "(\tfoo\tbar\t)",
            vec![LeftPar, Symbol("foo"), Symbol("bar"), RightPar],
        ),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
196
#[test]
fn test_expressions() {
    // Complete expressions mixing every token kind.
    let table = [
        (
            "(define x 42)",
            vec![
                LeftPar,
                Symbol("define"),
                Symbol("x"),
                Number("42"),
                RightPar,
            ],
        ),
        (
            "(+ 1 2)",
            vec![LeftPar, Symbol("+"), Number("1"), Number("2"), RightPar],
        ),
        (
            "(if (= x 0) 'zero 'nonzero)",
            vec![
                LeftPar,
                Symbol("if"),
                LeftPar,
                Symbol("="),
                Symbol("x"),
                Number("0"),
                RightPar,
                Quote,
                Symbol("zero"),
                Quote,
                Symbol("nonzero"),
                RightPar,
            ],
        ),
        (
            r#"(print "hello, world")"#,
            vec![LeftPar, Symbol("print"), String("hello, world"), RightPar],
        ),
        (
            "(lambda (x) (* x x))",
            vec![
                LeftPar,
                Symbol("lambda"),
                LeftPar,
                Symbol("x"),
                RightPar,
                LeftPar,
                Symbol("*"),
                Symbol("x"),
                Symbol("x"),
                RightPar,
                RightPar,
            ],
        ),
        (
            "'(1 2 3)",
            vec![
                Quote,
                LeftPar,
                Number("1"),
                Number("2"),
                Number("3"),
                RightPar,
            ],
        ),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
267
#[test]
fn test_comments() {
    // A `;` starts a comment that runs to the end of the line and yields no
    // token; it also terminates an atom it directly follows.
    let table = [
        (";", vec![]),
        (";\n", vec![]),
        ("; comment", vec![]),
        ("; comment\n", vec![]),
        ("; comment\n42", vec![Number("42")]),
        ("42 ; comment", vec![Number("42")]),
        ("42; comment", vec![Number("42")]),
        (
            "(+ 1 2) ; calc\n(- 3 4)",
            vec![
                LeftPar,
                Symbol("+"),
                Number("1"),
                Number("2"),
                RightPar,
                LeftPar,
                Symbol("-"),
                Number("3"),
                Number("4"),
                RightPar,
            ],
        ),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
298
299fn spans(input: &str) -> Vec<(Pos, Pos)> {
300 Lexer::new(input)
301 .map(|s| (s.span.start, s.span.end))
302 .collect()
303}
304
#[test]
fn test_span_single_char() {
    // A one-character token starts at line 1, column 0, offset 0 and ends
    // one column and one byte later.
    let expected = vec![(Pos::new(1, 0, 0), Pos::new(1, 1, 1))];
    assert_eq!(spans("("), expected);
}
310
#[test]
fn test_span_after_leading_whitespace() {
    // Three leading spaces are skipped before the token, so the span starts
    // at column 3 / byte offset 3. The input needs exactly three spaces for
    // the expected positions below to hold.
    let s = spans("   (");
    assert_eq!(s, vec![(Pos::new(1, 3, 3), Pos::new(1, 4, 4))]);
}
316
#[test]
fn test_span_after_newline() {
    // A newline bumps the line to 2 and resets the column, while the byte
    // offset keeps counting from the start of the input.
    let expected = vec![(Pos::new(2, 0, 1), Pos::new(2, 1, 2))];
    assert_eq!(spans("\n("), expected);
}
322
#[test]
fn test_span_multi_char() {
    // The span of a three-character symbol covers all three characters.
    let expected = vec![(Pos::new(1, 0, 0), Pos::new(1, 3, 3))];
    assert_eq!(spans("foo"), expected);
}
328
#[test]
fn test_span_string() {
    // A string's span includes both quote characters, not just the contents.
    let expected = vec![(Pos::new(1, 0, 0), Pos::new(1, 4, 4))];
    assert_eq!(spans(r#""hi""#), expected);
}
334
#[test]
fn test_span_sequence() {
    // Offsets within the input:
    //   (foo 42)
    //   012345678
    let expected = vec![
        (Pos::new(1, 0, 0), Pos::new(1, 1, 1)), // (
        (Pos::new(1, 1, 1), Pos::new(1, 4, 4)), // foo
        (Pos::new(1, 5, 5), Pos::new(1, 7, 7)), // 42
        (Pos::new(1, 7, 7), Pos::new(1, 8, 8)), // )
    ];
    assert_eq!(spans("(foo 42)"), expected);
}
350
#[test]
fn test_span_lines() {
    // The second token sits on line 2 at column 0, at byte offset 4 because
    // the newline itself occupies offset 3.
    let expected = vec![
        (Pos::new(1, 0, 0), Pos::new(1, 3, 3)),
        (Pos::new(2, 0, 4), Pos::new(2, 3, 7)),
    ];
    assert_eq!(spans("foo\nbar"), expected);
}
362
#[test]
fn test_span_after_comment() {
    // The comment `; cm` and its newline occupy offsets 0..=4, so `foo`
    // starts on line 2 at offset 5 and ends at offset 8.
    let expected = vec![(Pos::new(2, 0, 5), Pos::new(2, 3, 8))];
    assert_eq!(spans("; cm\nfoo"), expected);
}
370
#[test]
fn test_span_after_quote() {
    // The quote takes offset 0..1 and the symbol follows immediately at 1..6.
    let expected = vec![
        (Pos::new(1, 0, 0), Pos::new(1, 1, 1)),
        (Pos::new(1, 1, 1), Pos::new(1, 6, 6)),
    ];
    assert_eq!(spans("'hello"), expected);
}
diff --git a/compiler/src/lex/token.rs b/compiler/src/lex/token.rs
new file mode 100644
index 0000000..2d07885
--- /dev/null
+++ b/compiler/src/lex/token.rs
@@ -0,0 +1,10 @@
/// A lexical token; payloads are slices borrowed from the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token<'a> {
    /// The `(` character.
    LeftPar,
    /// The `)` character.
    RightPar,
    /// The `'` character.
    Quote,
    /// Raw text of an atom that opens like a number (not parsed or validated).
    Number(&'a str),
    /// Contents of a closed string literal, without the quotes and with
    /// escape sequences left as written.
    String(&'a str),
    /// Contents of a string literal that hit a newline or the end of input
    /// before its closing quote.
    UnclosedString(&'a str),
    /// Raw text of any other atom.
    Symbol(&'a str),
}