aboutsummaryrefslogtreecommitdiff
path: root/compiler/src
diff options
context:
space:
mode:
author: Tolmachev Igor <me@igorek.dev>, 2026-05-07 17:46:44 +0300
committer: Tolmachev Igor <me@igorek.dev>, 2026-05-07 17:46:44 +0300
commit: 558c5dcaf7bcc32cfe5672c4113962e3bcd19188 (patch)
tree: 63c4af6a66e706b3f61cbd9cfcefd57d1e3ab3ce /compiler/src
parent: 6b5bfd2d33d9aba94eed83ad8b119baa4732f5af (diff)
download: crisp-558c5dcaf7bcc32cfe5672c4113962e3bcd19188.tar.gz
download: crisp-558c5dcaf7bcc32cfe5672c4113962e3bcd19188.zip
Add lexer
Diffstat (limited to 'compiler/src')
-rw-r--r-- compiler/src/lexer/error.rs | 18
-rw-r--r-- compiler/src/lexer/mod.rs | 163
-rw-r--r-- compiler/src/lexer/tests.rs | 372
-rw-r--r-- compiler/src/lib.rs | 3
-rw-r--r-- compiler/src/span.rs | 61
5 files changed, 617 insertions, 0 deletions
diff --git a/compiler/src/lexer/error.rs b/compiler/src/lexer/error.rs
new file mode 100644
index 0000000..f251167
--- /dev/null
+++ b/compiler/src/lexer/error.rs
@@ -0,0 +1,18 @@
1use std::{error, fmt, result};
2
/// Result alias used by the lexer, with [`Error`] as the error type.
pub type Result<T> = result::Result<T, Error>;

/// Errors the lexer can report while scanning source text.
///
/// The enum carries no data, so it is `Copy`. That keeps a spanned
/// `Result<Token, Error>` as cheap to clone as the token it wraps.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Error {
    /// A string literal was opened with `"` but the closing quote was not
    /// found before a raw line break or the end of the input.
    UnclosedString,
}

impl fmt::Display for Error {
    /// Writes the human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Error::UnclosedString => f.write_str("unclosed string literal"),
        }
    }
}

impl error::Error for Error {}
diff --git a/compiler/src/lexer/mod.rs b/compiler/src/lexer/mod.rs
new file mode 100644
index 0000000..2ef4922
--- /dev/null
+++ b/compiler/src/lexer/mod.rs
@@ -0,0 +1,163 @@
1mod error;
2
3use crate::span::{Pos, Span};
4pub use error::{Error, Result};
5
6#[cfg(test)]
7mod tests;
8
/// Returns `true` when `ch` ends the atom (number or symbol) being scanned.
///
/// Terminators are any whitespace plus the characters that start another
/// lexical element: `(`, `)`, `'`, `"` and `;`.
fn is_terminator(ch: char) -> bool {
    match ch {
        '(' | ')' | '\'' | '"' | ';' => true,
        other => other.is_whitespace(),
    }
}
12
/// A lexical token. Text-carrying variants borrow their text from the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token<'a> {
    /// An opening parenthesis, `(`.
    LeftPar,
    /// A closing parenthesis, `)`.
    RightPar,
    /// A quote mark, `'`.
    Quote,
    /// The raw text of a numeric atom, exactly as written in the input.
    Number(&'a str),
    /// The text between the quotes of a string literal; escape sequences are
    /// left exactly as written.
    String(&'a str),
    /// The raw text of any other atom.
    Symbol(&'a str),
}
22
/// Scanner that walks over a source string and tracks its current position.
pub struct Lexer<'a> {
    /// The complete source text being scanned.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    cursor: usize,
    /// Current line number; starts at 1 and grows on every `\n` consumed.
    line: usize,
    /// Number of characters consumed since the start of the current line.
    column: usize,
}
30
31impl<'a> Lexer<'a> {
32 pub fn new(input: &'a str) -> Self {
33 Self {
34 input,
35 cursor: 0,
36
37 line: 1,
38 column: 0,
39 }
40 }
41
42 fn rest(&self) -> &str {
43 &self.input[self.cursor..]
44 }
45
46 fn peek(&self) -> Option<char> {
47 self.rest().chars().next()
48 }
49
50 fn peek_nth(&self, n: usize) -> Option<char> {
51 self.rest().chars().nth(n)
52 }
53
54 fn consume(&mut self) -> Option<char> {
55 let ch = self.peek()?;
56
57 self.cursor += ch.len_utf8();
58 if ch == '\n' {
59 self.line += 1;
60 self.column = 0;
61 } else {
62 self.column += 1;
63 }
64
65 Some(ch)
66 }
67
68 fn next_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> &'a str {
69 let start = self.cursor;
70
71 while let Some(ch) = self.peek() {
72 if !predicate(ch) {
73 break;
74 }
75 self.consume();
76 }
77
78 &self.input[start..self.cursor]
79 }
80
81 fn next_atom(&mut self) -> &'a str {
82 self.next_while(|ch| !is_terminator(ch))
83 }
84
85 fn next_string(&mut self) -> Result<&'a str> {
86 debug_assert_eq!(self.peek(), Some('"'));
87 self.consume();
88
89 let start = self.cursor;
90
91 while let Some(ch) = self.peek() {
92 match ch {
93 '"' => {
94 let string = &self.input[start..self.cursor];
95 self.consume();
96 return Ok(string);
97 }
98 '\n' => return Err(Error::UnclosedString),
99 '\\' => {
100 self.consume();
101 self.consume();
102 }
103 _ => {
104 self.consume();
105 }
106 }
107 }
108
109 Err(Error::UnclosedString)
110 }
111}
112
113impl<'a> Iterator for Lexer<'a> {
114 type Item = Span<Result<Token<'a>>>;
115
116 fn next(&mut self) -> Option<Self::Item> {
117 loop {
118 match self.peek()? {
119 ch if ch.is_whitespace() => {
120 self.next_while(char::is_whitespace);
121 }
122 ';' => {
123 self.next_while(|ch| ch != '\n');
124 }
125 _ => break,
126 }
127 }
128
129 let start = Pos::new(self.line, self.column, self.cursor);
130
131 let token = match self.peek()? {
132 '(' => {
133 self.consume();
134 Ok(Token::LeftPar)
135 }
136 ')' => {
137 self.consume();
138 Ok(Token::RightPar)
139 }
140 '\'' => {
141 self.consume();
142 Ok(Token::Quote)
143 }
144
145 // Number
146 ch if ch.is_ascii_digit()
147 || matches!(ch, '+' | '-' | '.')
148 && self.peek_nth(1).is_some_and(|ch| ch.is_ascii_digit()) =>
149 {
150 Ok(Token::Number(self.next_atom()))
151 }
152
153 // String
154 '"' => self.next_string().map(Token::String),
155
156 // Symbol
157 _ => Ok(Token::Symbol(self.next_atom())),
158 };
159
160 let end = Pos::new(self.line, self.column, self.cursor);
161 Some(Span::new(token, start, end))
162 }
163}
diff --git a/compiler/src/lexer/tests.rs b/compiler/src/lexer/tests.rs
new file mode 100644
index 0000000..65dd2f2
--- /dev/null
+++ b/compiler/src/lexer/tests.rs
@@ -0,0 +1,372 @@
1use crate::span::Pos;
2
3use super::Token::*;
4use super::*;
5
6fn tokenize<'a>(input: &'a str) -> Vec<Token<'a>> {
7 Lexer::new(input).map(|s| s.into_inner().unwrap()).collect()
8}
9
#[test]
fn test_spaces() {
    // Empty and whitespace-only inputs must not produce any token.
    let table = [
        ("", vec![]),
        (" ", vec![]),
        ("\n", vec![]),
        ("\t\n \r\n", vec![]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
22
#[test]
fn test_parens() {
    // Parentheses are lexed one by one; balance is not the lexer's concern.
    let table = [
        ("()", vec![LeftPar, RightPar]),
        ("( )", vec![LeftPar, RightPar]),
        ("(())", vec![LeftPar, LeftPar, RightPar, RightPar]),
        (
            "((()))",
            vec![LeftPar, LeftPar, LeftPar, RightPar, RightPar, RightPar],
        ),
        (")(", vec![RightPar, LeftPar]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
39
#[test]
fn test_quote() {
    // A quote is always its own token, whatever follows it.
    let table = [
        ("'", vec![Quote]),
        ("'a", vec![Quote, Symbol("a")]),
        ("''a", vec![Quote, Quote, Symbol("a")]),
        ("'()", vec![Quote, LeftPar, RightPar]),
        (
            "'(1 2)",
            vec![Quote, LeftPar, Number("1"), Number("2"), RightPar],
        ),
        ("(' )", vec![LeftPar, Quote, RightPar]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
57
#[test]
fn test_numbers() {
    // Every input is a single numeric atom whose token text is the input.
    let literals = [
        "0", "42", "3.14", "-7", "+5", "-0.5", "1e10", "1.5e-3", ".5",
    ];
    for literal in literals {
        assert_eq!(tokenize(literal), vec![Number(literal)]);
    }
}
75
#[test]
fn test_strings() {
    // Inside a string literal, parentheses, quotes and `;` are plain text.
    let table = [
        (r#""""#, vec![String("")]),
        (r#""hello""#, vec![String("hello")]),
        (r#""hello world""#, vec![String("hello world")]),
        (r#""(not a list)""#, vec![String("(not a list)")]),
        (r#""'not a quote""#, vec![String("'not a quote")]),
        (r#""; not a comment""#, vec![String("; not a comment")]),
        (r#"" spaces ""#, vec![String(" spaces ")]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
91
#[test]
fn test_string_escapes() {
    // Escape sequences are kept verbatim; a backslash also hides the
    // character after it from the closing-quote and line-break checks.
    let table = [
        (r#""line\nbreak""#, vec![String(r"line\nbreak")]),
        (r#""with \"quotes\"""#, vec![String(r#"with \"quotes\""#)]),
        (r#""\\""#, vec![String(r"\\")]),
        ("\"single\\\nline\"", vec![String("single\\\nline")]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
104
#[test]
fn test_symbols() {
    // Every input is a single symbol whose token text is the input itself.
    let names = [
        "foo", "foo-bar", "foo!", "empty?", "set!", "->", "+", "-", "*", "/", "=", "<=", ">=",
        "a1b2", "x",
    ];
    for name in names {
        assert_eq!(tokenize(name), vec![Symbol(name)]);
    }
}
128
#[test]
fn test_ambiguous() {
    // A leading sign or dot that is not followed by a digit makes a symbol,
    // not a number.
    for name in ["-x", "+foo", "...", ".foo"] {
        assert_eq!(tokenize(name), vec![Symbol(name)]);
    }
}
141
#[test]
fn test_no_separators() {
    // Parentheses, quotes and string delimiters split tokens on their own,
    // without any whitespace in between.
    let table = [
        ("(foo)", vec![LeftPar, Symbol("foo"), RightPar]),
        ("(1)", vec![LeftPar, Number("1"), RightPar]),
        ("(a)b", vec![LeftPar, Symbol("a"), RightPar, Symbol("b")]),
        ("'(a)", vec![Quote, LeftPar, Symbol("a"), RightPar]),
        (r#"("s")"#, vec![LeftPar, String("s"), RightPar]),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
155
#[test]
fn test_whitespace_separators() {
    // Newlines and tabs separate tokens just like spaces do, so both inputs
    // yield the same token stream.
    let expected = vec![LeftPar, Symbol("foo"), Symbol("bar"), RightPar];
    for source in ["(\n foo\n bar\n)", "(\tfoo\tbar\t)"] {
        assert_eq!(tokenize(source), expected);
    }
}
172
#[test]
fn test_expressions() {
    // Small but complete expressions mixing every token kind.
    let table = [
        (
            "(define x 42)",
            vec![
                LeftPar,
                Symbol("define"),
                Symbol("x"),
                Number("42"),
                RightPar,
            ],
        ),
        (
            "(+ 1 2)",
            vec![LeftPar, Symbol("+"), Number("1"), Number("2"), RightPar],
        ),
        (
            "(if (= x 0) 'zero 'nonzero)",
            vec![
                LeftPar,
                Symbol("if"),
                LeftPar,
                Symbol("="),
                Symbol("x"),
                Number("0"),
                RightPar,
                Quote,
                Symbol("zero"),
                Quote,
                Symbol("nonzero"),
                RightPar,
            ],
        ),
        (
            r#"(print "hello, world")"#,
            vec![LeftPar, Symbol("print"), String("hello, world"), RightPar],
        ),
        (
            "(lambda (x) (* x x))",
            vec![
                LeftPar,
                Symbol("lambda"),
                LeftPar,
                Symbol("x"),
                RightPar,
                LeftPar,
                Symbol("*"),
                Symbol("x"),
                Symbol("x"),
                RightPar,
                RightPar,
            ],
        ),
        (
            "'(1 2 3)",
            vec![
                Quote,
                LeftPar,
                Number("1"),
                Number("2"),
                Number("3"),
                RightPar,
            ],
        ),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
243
#[test]
fn test_comments() {
    // A `;` starts a comment that runs to the end of the line and yields no
    // token; it also terminates the atom directly before it.
    let table = [
        (";", vec![]),
        (";\n", vec![]),
        ("; comment", vec![]),
        ("; comment\n", vec![]),
        ("; comment\n42", vec![Number("42")]),
        ("42 ; comment", vec![Number("42")]),
        ("42; comment", vec![Number("42")]),
        (
            "(+ 1 2) ; calc\n(- 3 4)",
            vec![
                LeftPar,
                Symbol("+"),
                Number("1"),
                Number("2"),
                RightPar,
                LeftPar,
                Symbol("-"),
                Number("3"),
                Number("4"),
                RightPar,
            ],
        ),
    ];
    for (source, expected) in table {
        assert_eq!(tokenize(source), expected);
    }
}
274
275fn first_error(input: &str) -> Error {
276 Lexer::new(input)
277 .find_map(|s| s.into_inner().err())
278 .expect("error expected")
279}
280
#[test]
fn test_unclosed_string_at_eof() {
    // The input ends before the closing quote is seen.
    for source in [r#""abc"#, r#"""#] {
        assert_eq!(first_error(source), Error::UnclosedString);
    }
}
286
#[test]
fn test_unclosed_string_with_trailing_escape() {
    // A backslash as the very last character must not hide the missing quote.
    let error = first_error("\"abc\\");
    assert_eq!(error, Error::UnclosedString);
}
291
#[test]
fn test_unclosed_string_with_newline() {
    // An unescaped line break inside a literal ends it with an error.
    let error = first_error("\"abc\ndef\"");
    assert_eq!(error, Error::UnclosedString);
}
296
#[test]
fn test_lexer_stops_after_string_error() {
    // An unclosed string at the end of the input yields exactly one error
    // item, after which the iterator is exhausted.
    let mut lexer = Lexer::new(r#""abc"#);

    let first = lexer.next().unwrap().into_inner();
    assert!(first.is_err());

    let second = lexer.next();
    assert!(second.is_none());
}
303
304fn spans(input: &str) -> Vec<(Pos, Pos)> {
305 Lexer::new(input).map(|s| (s.start(), s.end())).collect()
306}
307
#[test]
fn test_span_single_char() {
    // A one-character token covers offsets 0..1 on line 1.
    let expected = vec![(Pos::new(1, 0, 0), Pos::new(1, 1, 1))];
    assert_eq!(spans("("), expected);
}
313
#[test]
fn test_span_after_leading_whitespace() {
    // Three leading spaces are skipped before the token, so the parenthesis
    // starts at column 3 / byte offset 3 and ends at column 4 / offset 4.
    let s = spans("   (");
    assert_eq!(s, vec![(Pos::new(1, 3, 3), Pos::new(1, 4, 4))]);
}
319
#[test]
fn test_span_after_newline() {
    // The newline bumps the line to 2 and resets the column, while the byte
    // offset keeps counting.
    let expected = vec![(Pos::new(2, 0, 1), Pos::new(2, 1, 2))];
    assert_eq!(spans("\n("), expected);
}
325
#[test]
fn test_span_multi_char_() {
    // The end position lies just past the last character of the atom.
    let expected = vec![(Pos::new(1, 0, 0), Pos::new(1, 3, 3))];
    assert_eq!(spans("foo"), expected);
}
331
#[test]
fn test_span_string() {
    // The span of a string literal includes both quote characters.
    let expected = vec![(Pos::new(1, 0, 0), Pos::new(1, 4, 4))];
    assert_eq!(spans(r#""hi""#), expected);
}
337
#[test]
fn test_span_sequence() {
    // Source:  (foo 42)
    // Offsets: 012345678
    let expected = vec![
        (Pos::new(1, 0, 0), Pos::new(1, 1, 1)), // (
        (Pos::new(1, 1, 1), Pos::new(1, 4, 4)), // foo
        (Pos::new(1, 5, 5), Pos::new(1, 7, 7)), // 42
        (Pos::new(1, 7, 7), Pos::new(1, 8, 8)), // )
    ];
    assert_eq!(spans("(foo 42)"), expected);
}
353
#[test]
fn test_span_lines() {
    // The second atom sits on line 2, column 0; byte offsets continue across
    // the line break.
    let first = (Pos::new(1, 0, 0), Pos::new(1, 3, 3));
    let second = (Pos::new(2, 0, 4), Pos::new(2, 3, 7));
    assert_eq!(spans("foo\nbar"), vec![first, second]);
}
365
#[test]
fn test_span_after_comment() {
    // The comment and its line break occupy byte offsets 0..=4, so `foo`
    // covers offsets 5..8 on line 2.
    let expected = vec![(Pos::new(2, 0, 5), Pos::new(2, 3, 8))];
    assert_eq!(spans("; cm\nfoo"), expected);
}
diff --git a/compiler/src/lib.rs b/compiler/src/lib.rs
new file mode 100644
index 0000000..b9b7a46
--- /dev/null
+++ b/compiler/src/lib.rs
@@ -0,0 +1,3 @@
1pub mod ast;
2pub mod lexer;
3pub mod span;
diff --git a/compiler/src/span.rs b/compiler/src/span.rs
new file mode 100644
index 0000000..0644c1c
--- /dev/null
+++ b/compiler/src/span.rs
@@ -0,0 +1,61 @@
/// A position in source text: a line, a column and a cursor offset.
///
/// The type itself attaches no meaning to the three numbers; it only stores
/// and returns them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Pos {
    line: usize,
    column: usize,
    cursor: usize,
}

impl Pos {
    /// Builds a position from its three components.
    pub fn new(line: usize, column: usize, cursor: usize) -> Self {
        Pos { line, column, cursor }
    }

    /// Returns the line component.
    pub fn line(self) -> usize {
        let Pos { line, .. } = self;
        line
    }

    /// Returns the column component.
    pub fn column(self) -> usize {
        let Pos { column, .. } = self;
        column
    }

    /// Returns the cursor (offset) component.
    pub fn cursor(self) -> usize {
        let Pos { cursor, .. } = self;
        cursor
    }
}
29
30#[derive(Clone, Debug)]
31pub struct Span<T> {
32 inner: T,
33 start: Pos,
34 end: Pos,
35}
36
37impl<T> Span<T> {
38 pub fn new(inner: T, start: Pos, end: Pos) -> Self {
39 Self { inner, start, end }
40 }
41
42 pub fn inner(&self) -> &T {
43 &self.inner
44 }
45
46 pub fn inner_mut(&mut self) -> &mut T {
47 &mut self.inner
48 }
49
50 pub fn into_inner(self) -> T {
51 self.inner
52 }
53
54 pub fn start(&self) -> Pos {
55 self.start
56 }
57
58 pub fn end(&self) -> Pos {
59 self.end
60 }
61}