From 558c5dcaf7bcc32cfe5672c4113962e3bcd19188 Mon Sep 17 00:00:00 2001
From: Tolmachev Igor
Date: Thu, 7 May 2026 17:46:44 +0300
Subject: Add lexer

---
Reviewer notes (below the `---` marker, so not part of the commit message):

* This copy of the patch was rebuilt from a dump in which line breaks and
  angle-bracketed spans had been dropped. The generic parameters
  (`Result<T>`, `Option<char>`, `Span<T>`, `Vec<Token<'a>>`, ...) are
  restored from how the code uses them; the author's e-mail address could
  not be restored. Runs of spaces inside string literals may have been
  shortened, except in test_span_after_leading_whitespace, where the
  expected position (column 3, cursor 3) fixes the input at three leading
  spaces.
* lib.rs declares `pub mod ast;`, but no ast module is created here. It is
  presumably provided by another commit - TODO confirm, otherwise the crate
  does not build.
* An unclosed string that ends at a newline yields Err(UnclosedString)
  without consuming the newline, so the iterator keeps lexing the following
  lines. Only the end-of-input case stops the iterator right after the error.

 compiler/src/lexer/error.rs |  18 +++
 compiler/src/lexer/mod.rs   | 163 +++++++++++++++++++
 compiler/src/lexer/tests.rs | 372 ++++++++++++++++++++++++++++++++++++++++++++
 compiler/src/lib.rs         |   3 +
 compiler/src/span.rs        |  61 ++++++++
 5 files changed, 617 insertions(+)
 create mode 100644 compiler/src/lexer/error.rs
 create mode 100644 compiler/src/lexer/mod.rs
 create mode 100644 compiler/src/lexer/tests.rs
 create mode 100644 compiler/src/lib.rs
 create mode 100644 compiler/src/span.rs

diff --git a/compiler/src/lexer/error.rs b/compiler/src/lexer/error.rs
new file mode 100644
index 0000000..f251167
--- /dev/null
+++ b/compiler/src/lexer/error.rs
@@ -0,0 +1,18 @@
+use std::{error, fmt, result};
+
+pub type Result<T> = result::Result<T, Error>;
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Error {
+    UnclosedString,
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Error::UnclosedString => write!(f, "unclosed string literal"),
+        }
+    }
+}
+
+impl error::Error for Error {}
diff --git a/compiler/src/lexer/mod.rs b/compiler/src/lexer/mod.rs
new file mode 100644
index 0000000..2ef4922
--- /dev/null
+++ b/compiler/src/lexer/mod.rs
@@ -0,0 +1,163 @@
+mod error;
+
+use crate::span::{Pos, Span};
+pub use error::{Error, Result};
+
+#[cfg(test)]
+mod tests;
+
+fn is_terminator(ch: char) -> bool {
+    ch.is_whitespace() || matches!(ch, '(' | ')' | '\'' | '"' | ';')
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Token<'a> {
+    LeftPar,
+    RightPar,
+    Quote,
+    Number(&'a str),
+    String(&'a str),
+    Symbol(&'a str),
+}
+
+pub struct Lexer<'a> {
+    input: &'a str,
+    cursor: usize,
+
+    line: usize,
+    column: usize,
+}
+
+impl<'a> Lexer<'a> {
+    pub fn new(input: &'a str) -> Self {
+        Self {
+            input,
+            cursor: 0,
+
+            line: 1,
+            column: 0,
+        }
+    }
+
+    fn rest(&self) -> &str {
+        &self.input[self.cursor..]
+    }
+
+    fn peek(&self) -> Option<char> {
+        self.rest().chars().next()
+    }
+
+    fn peek_nth(&self, n: usize) -> Option<char> {
+        self.rest().chars().nth(n)
+    }
+
+    fn consume(&mut self) -> Option<char> {
+        let ch = self.peek()?;
+
+        self.cursor += ch.len_utf8();
+        if ch == '\n' {
+            self.line += 1;
+            self.column = 0;
+        } else {
+            self.column += 1;
+        }
+
+        Some(ch)
+    }
+
+    fn next_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> &'a str {
+        let start = self.cursor;
+
+        while let Some(ch) = self.peek() {
+            if !predicate(ch) {
+                break;
+            }
+            self.consume();
+        }
+
+        &self.input[start..self.cursor]
+    }
+
+    fn next_atom(&mut self) -> &'a str {
+        self.next_while(|ch| !is_terminator(ch))
+    }
+
+    fn next_string(&mut self) -> Result<&'a str> {
+        debug_assert_eq!(self.peek(), Some('"'));
+        self.consume();
+
+        let start = self.cursor;
+
+        while let Some(ch) = self.peek() {
+            match ch {
+                '"' => {
+                    let string = &self.input[start..self.cursor];
+                    self.consume();
+                    return Ok(string);
+                }
+                '\n' => return Err(Error::UnclosedString),
+                '\\' => {
+                    self.consume();
+                    self.consume();
+                }
+                _ => {
+                    self.consume();
+                }
+            }
+        }
+
+        Err(Error::UnclosedString)
+    }
+}
+
+impl<'a> Iterator for Lexer<'a> {
+    type Item = Span<Result<Token<'a>>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            match self.peek()? {
+                ch if ch.is_whitespace() => {
+                    self.next_while(char::is_whitespace);
+                }
+                ';' => {
+                    self.next_while(|ch| ch != '\n');
+                }
+                _ => break,
+            }
+        }
+
+        let start = Pos::new(self.line, self.column, self.cursor);
+
+        let token = match self.peek()? {
+            '(' => {
+                self.consume();
+                Ok(Token::LeftPar)
+            }
+            ')' => {
+                self.consume();
+                Ok(Token::RightPar)
+            }
+            '\'' => {
+                self.consume();
+                Ok(Token::Quote)
+            }
+
+            // Number
+            ch if ch.is_ascii_digit()
+                || matches!(ch, '+' | '-' | '.')
+                    && self.peek_nth(1).is_some_and(|ch| ch.is_ascii_digit()) =>
+            {
+                Ok(Token::Number(self.next_atom()))
+            }
+
+            // String
+            '"' => self.next_string().map(Token::String),
+
+            // Symbol
+            _ => Ok(Token::Symbol(self.next_atom())),
+        };
+
+        let end = Pos::new(self.line, self.column, self.cursor);
+        Some(Span::new(token, start, end))
+    }
+}
diff --git a/compiler/src/lexer/tests.rs b/compiler/src/lexer/tests.rs
new file mode 100644
index 0000000..65dd2f2
--- /dev/null
+++ b/compiler/src/lexer/tests.rs
@@ -0,0 +1,372 @@
+use crate::span::Pos;
+
+use super::Token::*;
+use super::*;
+
+fn tokenize<'a>(input: &'a str) -> Vec<Token<'a>> {
+    Lexer::new(input).map(|s| s.into_inner().unwrap()).collect()
+}
+
+#[test]
+fn test_spaces() {
+    let cases = vec![
+        ("", vec![]),
+        (" ", vec![]),
+        ("\n", vec![]),
+        ("\t\n \r\n", vec![]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_parens() {
+    let cases = vec![
+        ("()", vec![LeftPar, RightPar]),
+        ("( )", vec![LeftPar, RightPar]),
+        ("(())", vec![LeftPar, LeftPar, RightPar, RightPar]),
+        (
+            "((()))",
+            vec![LeftPar, LeftPar, LeftPar, RightPar, RightPar, RightPar],
+        ),
+        (")(", vec![RightPar, LeftPar]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_quote() {
+    let cases = vec![
+        ("'", vec![Quote]),
+        ("'a", vec![Quote, Symbol("a")]),
+        ("''a", vec![Quote, Quote, Symbol("a")]),
+        ("'()", vec![Quote, LeftPar, RightPar]),
+        (
+            "'(1 2)",
+            vec![Quote, LeftPar, Number("1"), Number("2"), RightPar],
+        ),
+        ("(' )", vec![LeftPar, Quote, RightPar]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_numbers() {
+    let cases = vec![
+        ("0", vec![Number("0")]),
+        ("42", vec![Number("42")]),
+        ("3.14", vec![Number("3.14")]),
+        ("-7", vec![Number("-7")]),
+        ("+5", vec![Number("+5")]),
+        ("-0.5", vec![Number("-0.5")]),
+        ("1e10", vec![Number("1e10")]),
+        ("1.5e-3", vec![Number("1.5e-3")]),
+        (".5", vec![Number(".5")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_strings() {
+    let cases = vec![
+        (r#""""#, vec![String("")]),
+        (r#""hello""#, vec![String("hello")]),
+        (r#""hello world""#, vec![String("hello world")]),
+        (r#""(not a list)""#, vec![String("(not a list)")]),
+        (r#""'not a quote""#, vec![String("'not a quote")]),
+        (r#""; not a comment""#, vec![String("; not a comment")]),
+        (r#"" spaces ""#, vec![String(" spaces ")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_string_escapes() {
+    let cases = vec![
+        (r#""line\nbreak""#, vec![String(r"line\nbreak")]),
+        (r#""with \"quotes\"""#, vec![String(r#"with \"quotes\""#)]),
+        (r#""\\""#, vec![String(r"\\")]),
+        ("\"single\\\nline\"", vec![String("single\\\nline")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_symbols() {
+    let cases = vec![
+        ("foo", vec![Symbol("foo")]),
+        ("foo-bar", vec![Symbol("foo-bar")]),
+        ("foo!", vec![Symbol("foo!")]),
+        ("empty?", vec![Symbol("empty?")]),
+        ("set!", vec![Symbol("set!")]),
+        ("->", vec![Symbol("->")]),
+        ("+", vec![Symbol("+")]),
+        ("-", vec![Symbol("-")]),
+        ("*", vec![Symbol("*")]),
+        ("/", vec![Symbol("/")]),
+        ("=", vec![Symbol("=")]),
+        ("<=", vec![Symbol("<=")]),
+        (">=", vec![Symbol(">=")]),
+        ("a1b2", vec![Symbol("a1b2")]),
+        ("x", vec![Symbol("x")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_ambiguous() {
+    let cases = vec![
+        ("-x", vec![Symbol("-x")]),
+        ("+foo", vec![Symbol("+foo")]),
+        ("...", vec![Symbol("...")]),
+        (".foo", vec![Symbol(".foo")]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_no_separators() {
+    let cases = vec![
+        ("(foo)", vec![LeftPar, Symbol("foo"), RightPar]),
+        ("(1)", vec![LeftPar, Number("1"), RightPar]),
+        ("(a)b", vec![LeftPar, Symbol("a"), RightPar, Symbol("b")]),
+        ("'(a)", vec![Quote, LeftPar, Symbol("a"), RightPar]),
+        (r#"("s")"#, vec![LeftPar, String("s"), RightPar]),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_whitespace_separators() {
+    let cases = vec![
+        (
+            "(\n foo\n bar\n)",
+            vec![LeftPar, Symbol("foo"), Symbol("bar"), RightPar],
+        ),
+        (
+            "(\tfoo\tbar\t)",
+            vec![LeftPar, Symbol("foo"), Symbol("bar"), RightPar],
+        ),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_expressions() {
+    let cases = vec![
+        (
+            "(define x 42)",
+            vec![
+                LeftPar,
+                Symbol("define"),
+                Symbol("x"),
+                Number("42"),
+                RightPar,
+            ],
+        ),
+        (
+            "(+ 1 2)",
+            vec![LeftPar, Symbol("+"), Number("1"), Number("2"), RightPar],
+        ),
+        (
+            "(if (= x 0) 'zero 'nonzero)",
+            vec![
+                LeftPar,
+                Symbol("if"),
+                LeftPar,
+                Symbol("="),
+                Symbol("x"),
+                Number("0"),
+                RightPar,
+                Quote,
+                Symbol("zero"),
+                Quote,
+                Symbol("nonzero"),
+                RightPar,
+            ],
+        ),
+        (
+            r#"(print "hello, world")"#,
+            vec![LeftPar, Symbol("print"), String("hello, world"), RightPar],
+        ),
+        (
+            "(lambda (x) (* x x))",
+            vec![
+                LeftPar,
+                Symbol("lambda"),
+                LeftPar,
+                Symbol("x"),
+                RightPar,
+                LeftPar,
+                Symbol("*"),
+                Symbol("x"),
+                Symbol("x"),
+                RightPar,
+                RightPar,
+            ],
+        ),
+        (
+            "'(1 2 3)",
+            vec![
+                Quote,
+                LeftPar,
+                Number("1"),
+                Number("2"),
+                Number("3"),
+                RightPar,
+            ],
+        ),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+#[test]
+fn test_comments() {
+    let cases = vec![
+        (";", vec![]),
+        (";\n", vec![]),
+        ("; comment", vec![]),
+        ("; comment\n", vec![]),
+        ("; comment\n42", vec![Number("42")]),
+        ("42 ; comment", vec![Number("42")]),
+        ("42; comment", vec![Number("42")]),
+        (
+            "(+ 1 2) ; calc\n(- 3 4)",
+            vec![
+                LeftPar,
+                Symbol("+"),
+                Number("1"),
+                Number("2"),
+                RightPar,
+                LeftPar,
+                Symbol("-"),
+                Number("3"),
+                Number("4"),
+                RightPar,
+            ],
+        ),
+    ];
+    for (code, tokens) in cases {
+        assert_eq!(tokenize(code), tokens);
+    }
+}
+
+fn first_error(input: &str) -> Error {
+    Lexer::new(input)
+        .find_map(|s| s.into_inner().err())
+        .expect("error expected")
+}
+
+#[test]
+fn test_unclosed_string_at_eof() {
+    assert_eq!(first_error(r#""abc"#), Error::UnclosedString);
+    assert_eq!(first_error(r#"""#), Error::UnclosedString);
+}
+
+#[test]
+fn test_unclosed_string_with_trailing_escape() {
+    assert_eq!(first_error("\"abc\\"), Error::UnclosedString);
+}
+
+#[test]
+fn test_unclosed_string_with_newline() {
+    assert_eq!(first_error("\"abc\ndef\""), Error::UnclosedString);
+}
+
+#[test]
+fn test_lexer_stops_after_string_error() {
+    let mut lex = Lexer::new(r#""abc"#);
+    assert!(lex.next().unwrap().into_inner().is_err());
+    assert!(lex.next().is_none());
+}
+
+fn spans(input: &str) -> Vec<(Pos, Pos)> {
+    Lexer::new(input).map(|s| (s.start(), s.end())).collect()
+}
+
+#[test]
+fn test_span_single_char() {
+    let s = spans("(");
+    assert_eq!(s, vec![(Pos::new(1, 0, 0), Pos::new(1, 1, 1))]);
+}
+
+#[test]
+fn test_span_after_leading_whitespace() {
+    let s = spans("   (");
+    assert_eq!(s, vec![(Pos::new(1, 3, 3), Pos::new(1, 4, 4))]);
+}
+
+#[test]
+fn test_span_after_newline() {
+    let s = spans("\n(");
+    assert_eq!(s, vec![(Pos::new(2, 0, 1), Pos::new(2, 1, 2))]);
+}
+
+#[test]
+fn test_span_multi_char_() {
+    let s = spans("foo");
+    assert_eq!(s, vec![(Pos::new(1, 0, 0), Pos::new(1, 3, 3))]);
+}
+
+#[test]
+fn test_span_string() {
+    let s = spans(r#""hi""#);
+    assert_eq!(s, vec![(Pos::new(1, 0, 0), Pos::new(1, 4, 4))]);
+}
+
+#[test]
+fn test_span_sequence() {
+    // (foo 42)
+    // 012345678
+    let s = spans("(foo 42)");
+    assert_eq!(
+        s,
+        vec![
+            (Pos::new(1, 0, 0), Pos::new(1, 1, 1)), // (
+            (Pos::new(1, 1, 1), Pos::new(1, 4, 4)), // foo
+            (Pos::new(1, 5, 5), Pos::new(1, 7, 7)), // 42
+            (Pos::new(1, 7, 7), Pos::new(1, 8, 8)), // )
+        ],
+    );
+}
+
+#[test]
+fn test_span_lines() {
+    let s = spans("foo\nbar");
+    assert_eq!(
+        s,
+        vec![
+            (Pos::new(1, 0, 0), Pos::new(1, 3, 3)),
+            (Pos::new(2, 0, 4), Pos::new(2, 3, 7)),
+        ],
+    );
+}
+
+#[test]
+fn test_span_after_comment() {
+    // ; cm\nfoo
+    // 01234 5678
+    let s = spans("; cm\nfoo");
+    assert_eq!(s, vec![(Pos::new(2, 0, 5), Pos::new(2, 3, 8))]);
+}
diff --git a/compiler/src/lib.rs b/compiler/src/lib.rs
new file mode 100644
index 0000000..b9b7a46
--- /dev/null
+++ b/compiler/src/lib.rs
@@ -0,0 +1,3 @@
+pub mod ast;
+pub mod lexer;
+pub mod span;
diff --git a/compiler/src/span.rs b/compiler/src/span.rs
new file mode 100644
index 0000000..0644c1c
--- /dev/null
+++ b/compiler/src/span.rs
@@ -0,0 +1,61 @@
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Pos {
+    line: usize,
+    column: usize,
+    cursor: usize,
+}
+
+impl Pos {
+    pub fn new(line: usize, column: usize, cursor: usize) -> Self {
+        Self {
+            line,
+            column,
+            cursor,
+        }
+    }
+
+    pub fn line(self) -> usize {
+        self.line
+    }
+
+    pub fn column(self) -> usize {
+        self.column
+    }
+
+    pub fn cursor(self) -> usize {
+        self.cursor
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct Span<T> {
+    inner: T,
+    start: Pos,
+    end: Pos,
+}
+
+impl<T> Span<T> {
+    pub fn new(inner: T, start: Pos, end: Pos) -> Self {
+        Self { inner, start, end }
+    }
+
+    pub fn inner(&self) -> &T {
+        &self.inner
+    }
+
+    pub fn inner_mut(&mut self) -> &mut T {
+        &mut self.inner
+    }
+
+    pub fn into_inner(self) -> T {
+        self.inner
+    }
+
+    pub fn start(&self) -> Pos {
+        self.start
+    }
+
+    pub fn end(&self) -> Pos {
+        self.end
+    }
+}
-- 
cgit v1.3