From 96db6e88ab59cf267dca9f293b53e734bdfc2fb4 Mon Sep 17 00:00:00 2001 From: Caleb Jones Date: Sat, 24 Dec 2016 15:33:14 -0500 Subject: [PATCH] Begin a reimplementation that tracks source locations --- Cargo.toml | 1 - src/lib.rs | 491 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 278 insertions(+), 214 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2d769b5..1acdc83 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,4 +4,3 @@ version = "0.2.0" authors = ["Caleb Jones "] [dependencies] -nom = "^2.0" diff --git a/src/lib.rs b/src/lib.rs index b82be20..2d83e75 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,253 +3,318 @@ // #![warn(missing_docs)] #![deny(unsafe_code)] -#[macro_use] -extern crate nom; - -use nom::{digit, multispace, IResult}; -use std::str::FromStr; - -/// Indicates how parsing failed. -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ParseError { - /// We can't explain how the parsing failed. - Unspecified, -} +use std::borrow::Cow; +/// A type representing arbitrary symbolic expressions. `Sexp` carries the +/// source code location it came from along with it for later diagnostic +/// purposes. #[derive(Debug, PartialEq, Clone, PartialOrd)] -pub enum Sexp { +pub enum Sexp<'a, Loc=ByteSpan> where Loc: Span { /// A value representing a symbol. A symbol is an atomic unit - Sym(String), + Sym(Cow<'a, str>, Loc), /// A value representing a string literal. - Str(String), + Str(Cow<'a, str>, Loc), /// A value representing a single character. - Char(char), + Char(char, Loc), /// A value representing an integer. Any number containing no decimal point /// will be parsed as an `Int`. - Int(i64), + Int(i64, Loc), /// A value representing a float. Any number containing a decimal point will /// be parsed as a `Float`. 
- Float(f64), + Float(f64, Loc), /// A list of subexpressions - List(Vec<Sexp>), + List(Vec<Sexp<'a, Loc>>, Loc), } -pub fn parse_one(input: &str) -> Result<Sexp, ParseError> { - match do_parse!(input, - exp: sexp >> - opt!(complete!(multispace)) >> - eof!() >> - (exp)) { - IResult::Done(_, res) => Ok(res), - _ => Err(ParseError::Unspecified), +impl<'a, Loc> Sexp<'a, Loc> where Loc: Span { + pub fn get_loc(&self) -> &Loc { + match *self { + Sexp::Sym(.., ref l) => l, + Sexp::Str(.., ref l) => l, + Sexp::Char(.., ref l) => l, + Sexp::Int(.., ref l) => l, + Sexp::Float(.., ref l) => l, + Sexp::List(.., ref l) => l, + } } -} -pub fn parse(input: &str) -> Result<Vec<Sexp>, ParseError> { - let parse_res: IResult<&str, Vec<Sexp>> = - do_parse!(input, - exps: many1!(complete!(sexp)) >> - opt!(complete!(multispace)) >> - eof!() >> - (exps)); - match parse_res { - IResult::Done(_, res) => Ok(res), - e => { - println!("{:#?}", e); - Err(ParseError::Unspecified) + pub fn get_loc_mut(&mut self) -> &mut Loc { + match *self { + Sexp::Sym(.., ref mut l) => l, + Sexp::Str(.., ref mut l) => l, + Sexp::Char(.., ref mut l) => l, + Sexp::Int(.., ref mut l) => l, + Sexp::Float(.., ref mut l) => l, + Sexp::List(.., ref mut l) => l, } } } -named!(sexp<&str, Sexp>, - alt_complete!( - list => { |list| Sexp::List(list) } - | atom - ) -); - -named!(list<&str, Vec<Sexp> >, - do_parse!( - opt!(multispace) >> - tag_s!("(") >> - entries: many0!(sexp) >> - opt!(multispace) >> - tag_s!(")") >> - (entries) - ) -); - -named!(atom<&str, Sexp>, alt_complete!(string | symbol | number | character)); - -named!(string<&str, Sexp>, - do_parse!( - opt!(multispace) >> - tag_s!(r#"""#) >> - contents: take_until_s!(r#"""#) >> - tag_s!(r#"""#) >> - (Sexp::Str(contents.into())) - ) -); - -named!(symbol<&str, Sexp>, - do_parse!( - opt!(multispace) >> - peek!(valid_ident_prefix) >> - name: take_while1_s!(valid_ident_char) >> - (Sexp::Sym(name.into())) - ) -); - -fn valid_ident_prefix(ident: &str) -> IResult<&str, ()> { - match ident.chars().next() { - Some(c) if c != '#'
&& !c.is_digit(10) && valid_ident_char(c) => - IResult::Done(&ident[1..], ()), - None => IResult::Incomplete(nom::Needed::Unknown), - _ => IResult::Error(nom::ErrorKind::Custom(0)), - } + +// General Parsing Types /////////////////////////////////////////////////////// + +pub trait Span { + type Begin; + + fn offset(&self, begin: Self::Begin) -> Self; + fn begin(&self) -> Self::Begin; + fn union(&self, other: &Self) -> Self; } -fn valid_ident_char(c: char) -> bool { - !c.is_whitespace() && c != '"' && c != '(' && c != ')' +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum ParseResult<'a, T, E> { + Done(&'a str, T), + Error(E), } -named!(number<&str, Sexp>, - preceded!(opt!(multispace), - map_res!( - recognize!(do_parse!( - digit >> - is_float: opt!(complete!(tag_s!("."))) >> - opt!(complete!(digit)) >> - peek!(not!(valid_ident_prefix)) >> - () - )), - |text: &str| { - if text.contains(".") { - f64::from_str(text).map(Sexp::Float).or(Err(())) - } else { - i64::from_str(text).map(Sexp::Int).or(Err(())) - } - } - ) - ) -); - -named!(character<&str, Sexp>, - do_parse!( - opt!(multispace) >> - tag_s!(r#"#\"#) >> - character: take_s!(1) >> - (Sexp::Char(character.chars().next().unwrap())) - ) -); +use ParseResult::*; -#[cfg(test)] -#[test] -fn test_parse_number() { - assert_eq!(number("0"), IResult::Done("", Sexp::Int(0))); - assert_eq!(number("123"), IResult::Done("", Sexp::Int(123))); - assert_eq!(number("0123456789"), IResult::Done("", Sexp::Int(123456789))); - assert_eq!(number(" 42"), IResult::Done("", Sexp::Int(42))); - - assert_eq!(number("4."), IResult::Done("", Sexp::Float(4.))); - assert_eq!(number("4.2"), IResult::Done("", Sexp::Float(4.2))); - assert_eq!(number("1.00000000001"), - IResult::Done("", Sexp::Float(1.00000000001))); - - assert!(number(" 42a").is_err()); - assert_eq!(number("13()"), IResult::Done("()", Sexp::Int(13))); - - assert!(number("abc").is_err()); - assert!(number("()").is_err()); - assert!(number("").is_incomplete()); -} + +// Specific 
Parsing Types (ParseError, ByteSpan) /////////////////////////////// -#[cfg(test)] -#[test] -fn test_parse_ident() { - assert_eq!(symbol("+"), IResult::Done("", Sexp::Sym("+".into()))); - assert_eq!(symbol(" nil?"), IResult::Done("", Sexp::Sym("nil?".into()))); - assert_eq!(symbol(" ->socket"), IResult::Done("", Sexp::Sym("->socket".into()))); - assert_eq!(symbol("fib("), IResult::Done("(", Sexp::Sym("fib".into()))); - - // We reserve #foo for the implementation to do as it wishes - assert!(symbol("#hi").is_err()); - - assert!(symbol("0").is_err()); - assert!(symbol("()").is_err()); - assert!(symbol("").is_incomplete()); +/// Indicates how parsing failed. +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum ParseError<Loc=ByteSpan> where Loc: Span { + /// The input ended before a complete expression could be read. + UnexpectedEof, + Number(Option<Box<ParseError>>, Loc), + Unexpected(char, Loc::Begin), + Unimplemented, } -#[cfg(test)] -#[test] -fn test_parse_string() { - assert_eq!(string(r#""hello""#), IResult::Done("", Sexp::Str("hello".into()))); - assert_eq!(string(r#" "this is a nice string -with 0123 things in it""#), - IResult::Done("", Sexp::Str("this is a nice string\nwith 0123 things in it".into()))); - - assert!(string(r#""hi"#).is_err()); -} +type ByteSpan = (usize, usize); -#[cfg(test)] -#[test] -fn test_parse_char() { - assert_eq!(character(r#"#\""#), IResult::Done("", Sexp::Char('"'))); - assert_eq!(character(r#"#\ "#), IResult::Done("", Sexp::Char(' '))); - assert_eq!(character(r#" #\\"#), IResult::Done("", Sexp::Char('\\'))); - - assert!(character("#").is_incomplete()); - assert!(character("a").is_err()); +impl Span for ByteSpan { + type Begin = usize; + + fn offset(&self, begin: Self::Begin) -> Self { + (self.0 + begin, self.1 + begin) + } + + fn begin(&self) -> Self::Begin { + self.0 + } + + fn union(&self, other: &Self) -> Self { + use std::cmp::{min, max}; + (min(self.0, other.0), max(self.1, other.1)) + } } -#[cfg(test)] -#[test] -fn test_parse_list() { - assert_eq!(list("()"), IResult::Done("",
vec![])); - assert_eq!(list("(1)"), IResult::Done("", vec![Sexp::Int(1)])); - assert_eq!(list(" ( 1 2 3 a )"), IResult::Done("", vec![ - Sexp::Int(1), - Sexp::Int(2), - Sexp::Int(3), - Sexp::Sym("a".into()), - ])); + + +// Parsing Utilities /////////////////////////////////////////////////////////// + +trait IsDelimeter { + fn is_delimiter(&self) -> bool; } -#[cfg(test)] -#[test] -fn test_parse_only_one() { - assert!(parse_one("1 2").is_err()); +impl IsDelimeter for char { + fn is_delimiter(&self) -> bool { + self.is_whitespace() || *self == ';' + || *self == '(' || *self == ')' + || *self == '[' || *self == ']' + || *self == '{' || *self == '}' + || *self == '"' || *self == '\'' + || *self == '`' || *self == ',' + } } -#[cfg(test)] -#[test] -fn test_parse_expression() { - assert_eq!(parse_one(r#" -(def (main) - (print (str "say " #\" "Hello, World" #\" " today!"))) -"#), - Ok(Sexp::List(vec![ - Sexp::Sym("def".into()), - Sexp::List( - vec![Sexp::Sym("main".into())] - ), - Sexp::List(vec![ - Sexp::Sym("print".into()), - Sexp::List(vec![ - Sexp::Sym("str".into()), - Sexp::Str("say ".into()), - Sexp::Char('"'), - Sexp::Str("Hello, World".into()), - Sexp::Char('"'), - Sexp::Str(" today!".into()), - ]) - ]) - ]))); + +// Parsers ///////////////////////////////////////////////////////////////////// + +// pub fn parse_one(input: &str) -> Result<Sexp, ParseError>; + +// pub fn parse(input: &str) -> Result<Vec<Sexp>, ParseError>; + +pub fn parse_number(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> { + // Consume all the whitespace at the beginning of the string + let end_of_white = if let Some(pos) = input.find(|c: char| !c.is_whitespace()) { + pos + } else { + return Error(ParseError::Number( + Some(Box::new(ParseError::UnexpectedEof)), + (input.len(), input.len()).offset(start_loc))); + }; + + let input = &input[end_of_white..]; + let start_loc = start_loc + end_of_white; + + match input.chars().next() { + Some(c) if !c.is_digit(10) => { + return Error(ParseError::Number(
Some(Box::new(ParseError::Unexpected(c, 0))), + (0, c.len_utf8()).offset(start_loc))); + } + None => return Error(ParseError::Number( + Some(Box::new(ParseError::UnexpectedEof)), + (0, 0).offset(start_loc))), + _ => (), + } + + let base = 10; + + let mut end = 0; + // Before the decimal point + for (i, c) in input.char_indices() { + if c == '.' { + end = i + 1; + break; + } + + if c.is_delimiter() { + return Done(&input[i..], + Sexp::Int(input[..i].parse().expect("Already matched digits"), + (0, i).offset(start_loc))); + } + + if !c.is_digit(base) { + return Error(ParseError::Number( + Some(Box::new(ParseError::Unexpected(c, i))), + (i, i).offset(start_loc))); + } + + end = i + c.len_utf8(); + } + + if input[end..].is_empty() { + return Done(&input[end..], + Sexp::Int(input.parse().expect("Already matched digits"), + (0, end).offset(start_loc))); + } + + // After the decimal point + for (i, c) in input[end..].char_indices() { + if c.is_delimiter() { + return Done(&input[i+end..], + Sexp::Float(input[..end+i].parse().expect("Already matched digits.digits"), + (0, end+i).offset(start_loc))); + } + + if !c.is_digit(base) { + return Error(ParseError::Number( + Some(Box::new(ParseError::Unexpected(c, i + end))), + (i+end, i+end).offset(start_loc))); + } + } + + Done(&input[input.len()..], + Sexp::Float(input.parse().expect("Already matched digits.digits"), + (0, input.len()).offset(start_loc))) } + +// Tests /////////////////////////////////////////////////////////////////////// + #[cfg(test)] -#[test] -fn test_parse_multi() { - assert_eq!(parse(" 1 2 3 "), - Ok(vec![Sexp::Int(1), Sexp::Int(2), Sexp::Int(3)])); +mod test { + use super::*; + use super::ParseResult::*; + + #[test] + fn test_parse_number() { + assert_eq!(parse_number("1", 0), Done("", Sexp::Int(1, (0, 1)))); + assert_eq!(parse_number(" 13", 0), Done("", Sexp::Int(13, (1, 3)))); + assert_eq!(parse_number("1.2", 0), Done("", Sexp::Float(1.2, (0, 3)))); + assert_eq!(parse_number("\u{3000}4.2", 0), Done("", 
Sexp::Float(4.2, (0, 3).offset('\u{3000}'.len_utf8())))); + assert_eq!(parse_number(" 42 ", 0), Done(" ", Sexp::Int(42, (2, 4)))); + assert_eq!(parse_number(" 4.2 ", 0), Done(" ", Sexp::Float(4.2, (1, 4)))); + assert_eq!(parse_number("1()", 0), Done("()", Sexp::Int(1, (0, 1)))); + assert_eq!(parse_number("3.6()", 0), Done("()", Sexp::Float(3.6, (0, 3)))); + + assert_eq!(parse_number("", 0), Error(ParseError::Number(Some(Box::new(ParseError::UnexpectedEof)), (0, 0)))); + assert_eq!(parse_number("123a", 0), Error(ParseError::Number(Some(Box::new(ParseError::Unexpected('a', 3))), (3, 3)))); + assert_eq!(parse_number("66.6+", 0), Error(ParseError::Number(Some(Box::new(ParseError::Unexpected('+', 4))), (4, 4)))); + } } + +// #[cfg(test)] +// #[test] +// fn test_parse_ident() { +// assert_eq!(symbol("+"), IResult::Done("", Sexp::Sym("+".into()))); +// assert_eq!(symbol(" nil?"), IResult::Done("", Sexp::Sym("nil?".into()))); +// assert_eq!(symbol(" ->socket"), IResult::Done("", Sexp::Sym("->socket".into()))); +// assert_eq!(symbol("fib("), IResult::Done("(", Sexp::Sym("fib".into()))); + +// // We reserve #foo for the implementation to do as it wishes +// assert!(symbol("#hi").is_err()); + +// assert!(symbol("0").is_err()); +// assert!(symbol("()").is_err()); +// assert!(symbol("").is_incomplete()); +// } + +// #[cfg(test)] +// #[test] +// fn test_parse_string() { +// assert_eq!(string(r#""hello""#), IResult::Done("", Sexp::Str("hello".into()))); +// assert_eq!(string(r#" "this is a nice string +// with 0123 things in it""#), +// IResult::Done("", Sexp::Str("this is a nice string\nwith 0123 things in it".into()))); + +// assert!(string(r#""hi"#).is_err()); +// } + +// #[cfg(test)] +// #[test] +// fn test_parse_char() { +// assert_eq!(character(r#"#\""#), IResult::Done("", Sexp::Char('"'))); +// assert_eq!(character(r#"#\ "#), IResult::Done("", Sexp::Char(' '))); +// assert_eq!(character(r#" #\\"#), IResult::Done("", Sexp::Char('\\'))); + +// 
assert!(character("#").is_incomplete()); +// assert!(character("a").is_err()); +// } + + +// #[cfg(test)] +// #[test] +// fn test_parse_list() { +// assert_eq!(list("()"), IResult::Done("", vec![])); +// assert_eq!(list("(1)"), IResult::Done("", vec![Sexp::Int(1)])); +// assert_eq!(list(" ( 1 2 3 a )"), IResult::Done("", vec![ +// Sexp::Int(1), +// Sexp::Int(2), +// Sexp::Int(3), +// Sexp::Sym("a".into()), +// ])); +// } + +// #[cfg(test)] +// #[test] +// fn test_parse_only_one() { +// assert!(parse_one("1 2").is_err()); +// } + +// #[cfg(test)] +// #[test] +// fn test_parse_expression() { +// assert_eq!(parse_one(r#" +// (def (main) +// (print (str "say " #\" "Hello, World" #\" " today!"))) +// "#), +// Ok(Sexp::List(vec![ +// Sexp::Sym("def".into()), +// Sexp::List( +// vec![Sexp::Sym("main".into())] +// ), +// Sexp::List(vec![ +// Sexp::Sym("print".into()), +// Sexp::List(vec![ +// Sexp::Sym("str".into()), +// Sexp::Str("say ".into()), +// Sexp::Char('"'), +// Sexp::Str("Hello, World".into()), +// Sexp::Char('"'), +// Sexp::Str(" today!".into()), +// ]) +// ]) +// ]))); +// } + +// #[cfg(test)] +// #[test] +// fn test_parse_multi() { +// assert_eq!(parse(" 1 2 3 "), +// Ok(vec![Sexp::Int(1), Sexp::Int(2), Sexp::Int(3)])); +// } -- 2.43.2