From 160b64427d79290a59ac48c9babca064232d8dfd Mon Sep 17 00:00:00 2001
From: Tolmachev Igor
Date: Sat, 9 May 2026 20:47:04 +0300
Subject: Make project structure more consistent

---
 compiler/src/lex/lexer.rs | 161 +++++++++++++++++++
 compiler/src/lex/mod.rs   |   8 +
 compiler/src/lex/tests.rs | 383 ++++++++++++++++++++++++++++++++++++++++++++++
 compiler/src/lex/token.rs |  10 ++
 4 files changed, 562 insertions(+)
 create mode 100644 compiler/src/lex/lexer.rs
 create mode 100644 compiler/src/lex/mod.rs
 create mode 100644 compiler/src/lex/tests.rs
 create mode 100644 compiler/src/lex/token.rs

(limited to 'compiler/src/lex')

diff --git a/compiler/src/lex/lexer.rs b/compiler/src/lex/lexer.rs
new file mode 100644
index 0000000..801d382
--- /dev/null
+++ b/compiler/src/lex/lexer.rs
@@ -0,0 +1,161 @@
+use crate::{
+    lex::Token,
+    span::{Pos, Span, Spanned},
+};
+
+fn is_terminator(ch: char) -> bool {
+    ch.is_whitespace() || matches!(ch, '(' | ')' | '\'' | '"' | ';')
+}
+
+pub struct Lexer<'a> {
+    input: &'a str,
+    cursor: usize,
+
+    line: usize,
+    column: usize,
+}
+
+impl<'a> Lexer<'a> {
+    pub fn new(input: &'a str) -> Self {
+        Self {
+            input,
+            cursor: 0,
+
+            line: 1,
+            column: 0,
+        }
+    }
+
+    fn rest(&self) -> &str {
+        &self.input[self.cursor..]
+    }
+
+    fn peek(&self) -> Option<char> {
+        self.rest().chars().next()
+    }
+
+    fn peek_nth(&self, n: usize) -> Option<char> {
+        self.rest().chars().nth(n)
+    }
+
+    fn consume(&mut self) -> Option<char> {
+        let ch = self.peek()?;
+
+        self.cursor += ch.len_utf8();
+        if ch == '\n' {
+            self.line += 1;
+            self.column = 0;
+        } else {
+            self.column += 1;
+        }
+
+        Some(ch)
+    }
+
+    fn next_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> &'a str {
+        let start = self.cursor;
+
+        while let Some(ch) = self.peek() {
+            if !predicate(ch) {
+                break;
+            }
+            self.consume();
+        }
+
+        &self.input[start..self.cursor]
+    }
+
+    fn next_atom(&mut self) -> &'a str {
+        self.next_while(|ch| !is_terminator(ch))
+    }
+
+    fn next_string(&mut self) -> Result<&'a str, &'a str> {
+        debug_assert_eq!(self.peek(), Some('"'));
+        self.consume();
+
+        let start = self.cursor;
+
+        while let Some(ch) = self.peek() {
+            match ch {
+                '"' => {
+                    let string = &self.input[start..self.cursor];
+                    self.consume();
+                    return Ok(string);
+                }
+                '\n' => {
+                    let string = &self.input[start..self.cursor];
+                    self.consume();
+                    return Err(string);
+                }
+                '\\' => {
+                    self.consume();
+                    self.consume();
+                }
+                _ => {
+                    self.consume();
+                }
+            }
+        }
+
+        Err(&self.input[start..self.cursor])
+    }
+}
+
+impl<'a> Iterator for Lexer<'a> {
+    type Item = Spanned<Token<'a>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            match self.peek()? {
+                ch if ch.is_whitespace() => {
+                    self.next_while(char::is_whitespace);
+                }
+                ';' => {
+                    self.next_while(|ch| ch != '\n');
+                }
+                _ => break,
+            }
+        }
+
+        let start = Pos::new(self.line, self.column, self.cursor);
+
+        let token = match self.peek()? {
+            '(' => {
+                self.consume();
+                Token::LeftPar
+            }
+            ')' => {
+                self.consume();
+                Token::RightPar
+            }
+            '\'' => {
+                self.consume();
+                Token::Quote
+            }
+
+            // Number
+            ch if ch.is_ascii_digit()
+                || ch == '.' && self.peek_nth(1).is_some_and(|ch| ch.is_ascii_digit())
+                || matches!(ch, '+' | '-')
+                    && self.peek_nth(1).is_some_and(|ch| ch.is_ascii_digit())
+                || matches!(ch, '+' | '-')
+                    && self.peek_nth(1).is_some_and(|ch| ch == '.')
+                    && self.peek_nth(2).is_some_and(|ch| ch.is_ascii_digit()) =>
+            {
+                Token::Number(self.next_atom())
+            }
+
+            // String
+            '"' => match self.next_string() {
+                Ok(string) => Token::String(string),
+                Err(string) => Token::UnclosedString(string),
+            },
+
+            // Symbol
+            _ => Token::Symbol(self.next_atom()),
+        };
+
+        let end = Pos::new(self.line, self.column, self.cursor);
+        Some(Spanned::new(token, Span::new(start, end)))
+    }
+}
diff --git a/compiler/src/lex/mod.rs b/compiler/src/lex/mod.rs
new file mode 100644
index 0000000..7bc4440
--- /dev/null
+++ b/compiler/src/lex/mod.rs
@@ -0,0 +1,8 @@
+mod lexer;
+mod token;
+
+pub use lexer::Lexer;
+pub use token::Token;
+
+#[cfg(test)]
+mod tests;
diff --git a/compiler/src/lex/tests.rs b/compiler/src/lex/tests.rs
new file mode 100644
index 0000000..2d872a2
--- /dev/null
+++ b/compiler/src/lex/tests.rs
@@ -0,0 +1,383 @@
+use crate::{
+    lex::{Lexer, Token, Token::*},
+    span::Pos,
+};
+
+fn tokenize<'a>(input: &'a str) -> Vec<Token<'a>> {
+    Lexer::new(input).map(|s| s.inner).collect()
+}
+
+#[test]
+fn test_spaces() {
+    let cases = vec![
+        ("", vec![]),
+        (" ", vec![]),
+        ("\n", vec![]),
+        ("\t\n \r\n", vec![]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_parens() {
+    let cases = vec![
+        ("()", vec![LeftPar, RightPar]),
+        ("( )", vec![LeftPar, RightPar]),
+        ("(())", vec![LeftPar, LeftPar, RightPar, RightPar]),
+        (
+            "((()))",
+            vec![LeftPar, LeftPar, LeftPar, RightPar, RightPar, RightPar],
+        ),
+        (")(", vec![RightPar, LeftPar]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_quote() {
+    let cases = vec![
+        ("'", vec![Quote]),
+        ("'a", vec![Quote, Symbol("a")]),
+        ("''a", vec![Quote, Quote, Symbol("a")]),
+        ("'()", vec![Quote, LeftPar, RightPar]),
+        (
+            "'(1 2)",
+            vec![Quote, LeftPar, Number("1"), Number("2"), RightPar],
+        ),
+        ("(' )", vec![LeftPar, Quote, RightPar]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_numbers() {
+    let cases = vec![
+        ("0", vec![Number("0")]),
+        ("42", vec![Number("42")]),
+        ("3.14", vec![Number("3.14")]),
+        ("-7", vec![Number("-7")]),
+        ("+5", vec![Number("+5")]),
+        ("-0.5", vec![Number("-0.5")]),
+        ("1e10", vec![Number("1e10")]),
+        ("1.5e-3", vec![Number("1.5e-3")]),
+        (".5", vec![Number(".5")]),
+        ("-.5", vec![Number("-.5")]),
+        ("+.5", vec![Number("+.5")]),
+        ("-.0", vec![Number("-.0")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_strings() {
+    let cases = vec![
+        (r#""""#, vec![String("")]),
+        (r#""hello""#, vec![String("hello")]),
+        (r#""hello world""#, vec![String("hello world")]),
+        (r#""(not a list)""#, vec![String("(not a list)")]),
+        (r#""'not a quote""#, vec![String("'not a quote")]),
+        (r#""; not a comment""#, vec![String("; not a comment")]),
+        (r#"" spaces ""#, vec![String(" spaces ")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_string_escapes() {
+    let cases = vec![
+        (r#""line\nbreak""#, vec![String(r"line\nbreak")]),
+        (r#""with \"quotes\"""#, vec![String(r#"with \"quotes\""#)]),
+        (r#""\\""#, vec![String(r"\\")]),
+        ("\"single\\\nline\"", vec![String("single\\\nline")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_unclosed_strings() {
+    let cases = vec![
+        (r#""abc"#, vec![UnclosedString("abc")]),
+        (r#""abc\""#, vec![UnclosedString(r#"abc\""#)]),
+        ("\"abc\n", vec![UnclosedString("abc")]),
+        ("\"abc\\\ndef", vec![UnclosedString("abc\\\ndef")]),
+        ("\"abc\n\"def\"", vec![UnclosedString("abc"), String("def")]),
+        (r#"""#, vec![UnclosedString("")]),
+        ("\"\n\"", vec![UnclosedString(""), UnclosedString("")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_symbols() {
+    let cases = vec![
+        ("foo", vec![Symbol("foo")]),
+        ("foo-bar", vec![Symbol("foo-bar")]),
+        ("foo!", vec![Symbol("foo!")]),
+        ("empty?", vec![Symbol("empty?")]),
+        ("set!", vec![Symbol("set!")]),
+        ("->", vec![Symbol("->")]),
+        ("+", vec![Symbol("+")]),
+        ("-", vec![Symbol("-")]),
+        ("*", vec![Symbol("*")]),
+        ("/", vec![Symbol("/")]),
+        ("=", vec![Symbol("=")]),
+        ("<=", vec![Symbol("<=")]),
+        (">=", vec![Symbol(">=")]),
+        ("a1b2", vec![Symbol("a1b2")]),
+        ("x", vec![Symbol("x")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_ambiguous() {
+    let cases = vec![
+        ("-x", vec![Symbol("-x")]),
+        ("+foo", vec![Symbol("+foo")]),
+        ("...", vec![Symbol("...")]),
+        (".foo", vec![Symbol(".foo")]),
+        ("-.", vec![Symbol("-.")]),
+        ("+.", vec![Symbol("+.")]),
+        (".", vec![Symbol(".")]),
+        ("+.a", vec![Symbol("+.a")]),
+        ("-.a", vec![Symbol("-.a")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_no_separators() {
+    let cases = vec![
+        ("(foo)", vec![LeftPar, Symbol("foo"), RightPar]),
+        ("(1)", vec![LeftPar, Number("1"), RightPar]),
+        ("(a)b", vec![LeftPar, Symbol("a"), RightPar, Symbol("b")]),
+        ("'(a)", vec![Quote, LeftPar, Symbol("a"), RightPar]),
+        (r#"("s")"#, vec![LeftPar, String("s"), RightPar]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_whitespace_separators() {
+    let cases = vec![
+        (
+            "(\n foo\n bar\n)",
+            vec![LeftPar, Symbol("foo"), Symbol("bar"), RightPar],
+        ),
+        (
+            "(\tfoo\tbar\t)",
+            vec![LeftPar, Symbol("foo"), Symbol("bar"), RightPar],
+        ),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_expressions() {
+    let cases = vec![
+        (
+            "(define x 42)",
+            vec![
+                LeftPar,
+                Symbol("define"),
+                Symbol("x"),
+                Number("42"),
+                RightPar,
+            ],
+        ),
+        (
+            "(+ 1 2)",
+            vec![LeftPar, Symbol("+"), Number("1"), Number("2"), RightPar],
+        ),
+        (
+            "(if (= x 0) 'zero 'nonzero)",
+            vec![
+                LeftPar,
+                Symbol("if"),
+                LeftPar,
+                Symbol("="),
+                Symbol("x"),
+                Number("0"),
+                RightPar,
+                Quote,
+                Symbol("zero"),
+                Quote,
+                Symbol("nonzero"),
+                RightPar,
+            ],
+        ),
+        (
+            r#"(print "hello, world")"#,
+            vec![LeftPar, Symbol("print"), String("hello, world"), RightPar],
+        ),
+        (
+            "(lambda (x) (* x x))",
+            vec![
+                LeftPar,
+                Symbol("lambda"),
+                LeftPar,
+                Symbol("x"),
+                RightPar,
+                LeftPar,
+                Symbol("*"),
+                Symbol("x"),
+                Symbol("x"),
+                RightPar,
+                RightPar,
+            ],
+        ),
+        (
+            "'(1 2 3)",
+            vec![
+                Quote,
+                LeftPar,
+                Number("1"),
+                Number("2"),
+                Number("3"),
+                RightPar,
+            ],
+        ),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_comments() {
+    let cases = vec![
+        (";", vec![]),
+        (";\n", vec![]),
+        ("; comment", vec![]),
+        ("; comment\n", vec![]),
+        ("; comment\n42", vec![Number("42")]),
+        ("42 ; comment", vec![Number("42")]),
+        ("42; comment", vec![Number("42")]),
+        (
+            "(+ 1 2) ; calc\n(- 3 4)",
+            vec![
+                LeftPar,
+                Symbol("+"),
+                Number("1"),
+                Number("2"),
+                RightPar,
+                LeftPar,
+                Symbol("-"),
+                Number("3"),
+                Number("4"),
+                RightPar,
+            ],
+        ),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+fn spans(input: &str) -> Vec<(Pos, Pos)> {
+    Lexer::new(input)
+        .map(|s| (s.span.start, s.span.end))
+        .collect()
+}
+
+#[test]
+fn test_span_single_char() {
+    let s = spans("(");
+    assert_eq!(s, vec![(Pos::new(1, 0, 0), Pos::new(1, 1, 1))]);
+}
+
+#[test]
+fn test_span_after_leading_whitespace() {
+    let s = spans("   (");
+    assert_eq!(s, vec![(Pos::new(1, 3, 3), Pos::new(1, 4, 4))]);
+}
+
+#[test]
+fn test_span_after_newline() {
+    let s = spans("\n(");
+    assert_eq!(s, vec![(Pos::new(2, 0, 1), Pos::new(2, 1, 2))]);
+}
+
+#[test]
+fn test_span_multi_char() {
+    let s = spans("foo");
+    assert_eq!(s, vec![(Pos::new(1, 0, 0), Pos::new(1, 3, 3))]);
+}
+
+#[test]
+fn test_span_string() {
+    let s = spans(r#""hi""#);
+    assert_eq!(s, vec![(Pos::new(1, 0, 0), Pos::new(1, 4, 4))]);
+}
+
+#[test]
+fn test_span_sequence() {
+    // (foo 42)
+    // 012345678
+    let s = spans("(foo 42)");
+    assert_eq!(
+        s,
+        vec![
+            (Pos::new(1, 0, 0), Pos::new(1, 1, 1)), // (
+            (Pos::new(1, 1, 1), Pos::new(1, 4, 4)), // foo
+            (Pos::new(1, 5, 5), Pos::new(1, 7, 7)), // 42
+            (Pos::new(1, 7, 7), Pos::new(1, 8, 8)), // )
+        ],
+    );
+}
+
+#[test]
+fn test_span_lines() {
+    let s = spans("foo\nbar");
+    assert_eq!(
+        s,
+        vec![
+            (Pos::new(1, 0, 0), Pos::new(1, 3, 3)),
+            (Pos::new(2, 0, 4), Pos::new(2, 3, 7)),
+        ],
+    );
+}
+
+#[test]
+fn test_span_after_comment() {
+    // ; cm\nfoo
+    // 01234 5678
+    let s = spans("; cm\nfoo");
+    assert_eq!(s, vec![(Pos::new(2, 0, 5), Pos::new(2, 3, 8))]);
+}
+
+#[test]
+fn test_span_after_quote() {
+    // 'hello
+    // 0123456
+    let s = spans("'hello");
+    assert_eq!(
+        s,
+        vec![
+            (Pos::new(1, 0, 0), Pos::new(1, 1, 1)),
+            (Pos::new(1, 1, 1), Pos::new(1, 6, 6))
+        ]
+    );
+}
diff --git a/compiler/src/lex/token.rs b/compiler/src/lex/token.rs
new file mode 100644
index 0000000..2d07885
--- /dev/null
+++ b/compiler/src/lex/token.rs
@@ -0,0 +1,10 @@
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Token<'a> {
+    LeftPar,
+    RightPar,
+    Quote,
+    Number(&'a str),
+    String(&'a str),
+    UnclosedString(&'a str),
+    Symbol(&'a str),
+}
-- 
cgit v1.3