]> Witch of Git - ess/blob - src/lib.rs
Add top level parsing functions
[ess] / src / lib.rs
1 //! A lightweight S-expression parser intended for language implementation.
2
3 // #![warn(missing_docs)]
4 #![deny(unsafe_code)]
5
6 use std::borrow::Cow;
7
8 /// A type representing arbitrary symbolic expressions. `Sexp` carries the
9 /// source code location it came from along with it for later diagnostic
10 /// purposes.
11 #[derive(Debug, PartialEq, Clone, PartialOrd)]
12 pub enum Sexp<'a, Loc=ByteSpan> where Loc: Span {
13 /// A value representing a symbol.
14 Sym(Cow<'a, str>, Loc),
15 /// A value representing a string literal.
16 Str(Cow<'a, str>, Loc),
17 /// A value representing a single character.
18 Char(char, Loc),
19 /// A value representing an integer. Any number containing no decimal point
20 /// will be parsed as an `Int`.
21 Int(i64, Loc),
22 /// A value representing a floating point number. Any number containing a
23 /// decimal point will be parsed as a `Float`.
24 Float(f64, Loc),
25 /// A list of subexpressions.
26 List(Vec<Sexp<'a, Loc>>, Loc),
27 }
28
29 impl<'a, Loc> Sexp<'a, Loc> where Loc: Span {
30 pub fn get_loc(&self) -> &Loc {
31 match *self {
32 Sexp::Sym(.., ref l) => l,
33 Sexp::Str(.., ref l) => l,
34 Sexp::Char(.., ref l) => l,
35 Sexp::Int(.., ref l) => l,
36 Sexp::Float(.., ref l) => l,
37 Sexp::List(.., ref l) => l,
38 }
39 }
40
41 pub fn get_loc_mut(&mut self) -> &mut Loc {
42 match *self {
43 Sexp::Sym(.., ref mut l) => l,
44 Sexp::Str(.., ref mut l) => l,
45 Sexp::Char(.., ref mut l) => l,
46 Sexp::Int(.., ref mut l) => l,
47 Sexp::Float(.., ref mut l) => l,
48 Sexp::List(.., ref mut l) => l,
49 }
50 }
51 }
52
53 \f
54 // General Parsing Types ///////////////////////////////////////////////////////
55
/// A source location that can be attached to expressions and parse errors.
///
/// NOTE(review): the trait itself does not pin down semantics; the method
/// notes below describe what the `(usize, usize)` implementation in this file
/// does.
pub trait Span {
    /// The type of a position at which a span can begin.
    type Begin;

    /// Returns the position this span begins at.
    fn begin(&self) -> Self::Begin;

    /// Returns a new span derived from this one and `begin`. For byte spans
    /// both endpoints are moved forward by `begin`.
    fn offset(&self, begin: Self::Begin) -> Self;

    /// Returns a new span derived from this one and `other`. For byte spans
    /// the result runs from the smaller start to the larger end.
    fn union(&self, other: &Self) -> Self;
}
63
/// The outcome of running a single parser over some input.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseResult<'a, T, E> {
    /// Parsing succeeded: the input left over after the parsed value,
    /// followed by the value itself.
    Done(&'a str, T),
    /// Parsing failed with the given error.
    Error(E),
}
69
70 use ParseResult::*;
71
72 \f
73 // Specific Parsing Types (ParseError, ByteSpan) ///////////////////////////////
74
75 /// Indicates how parsing failed.
76 #[derive(Debug, PartialEq, Eq, Clone)]
77 pub enum ParseError<Loc=ByteSpan> where Loc: Span {
78 UnexpectedEof,
79 List(Box<ParseError>, Loc),
80 Sexp(Box<ParseError>, Loc),
81 Char(Box<ParseError>, Loc),
82 String(Box<ParseError>, Loc),
83 Symbol(Box<ParseError>, Loc),
84 Number(Box<ParseError>, Loc),
85 Unexpected(char, Loc::Begin),
86 Unimplemented,
87 }
88
89 type ByteSpan = (usize, usize);
90
91 impl Span for ByteSpan {
92 type Begin = usize;
93
94 fn offset(&self, begin: Self::Begin) -> Self {
95 (self.0 + begin, self.1 + begin)
96 }
97
98 fn begin(&self) -> Self::Begin {
99 self.0
100 }
101
102 fn union(&self, other: &Self) -> Self {
103 use std::cmp::{min, max};
104 (min(self.0, other.0), max(self.1, other.1))
105 }
106 }
107
108
109 \f
110 // Parsing Utilities ///////////////////////////////////////////////////////////
111
/// Classifies characters that end a number or symbol token.
trait IsDelimeter {
    /// Returns `true` if the value terminates the token before it.
    fn is_delimiter(&self) -> bool;
}

impl IsDelimeter for char {
    /// Whitespace, `;`, any bracket, either quote, backtick and comma are
    /// delimiters.
    fn is_delimiter(&self) -> bool {
        match *self {
            ';' | '(' | ')' | '[' | ']' | '{' | '}' | '"' | '\'' | '`' | ',' => true,
            other => other.is_whitespace(),
        }
    }
}
126
/// Skips leading whitespace in `$input`.
///
/// Evaluates to `(rest, location)`, where `rest` starts at the first
/// non-whitespace character and `location` is `$start_loc` advanced by the
/// number of bytes skipped. If `$input` is empty or entirely whitespace, the
/// *enclosing function* returns an `Error` built by `$ErrorFn` around
/// `ParseError::UnexpectedEof`, located at the end of the input.
macro_rules! consume_whitespace {
    ($input:expr, $start_loc:expr, $ErrorFn:expr) => {
        match $input.find(|c: char| !c.is_whitespace()) {
            Some(skipped) => (&$input[skipped..], $start_loc + skipped),
            None => return Error($ErrorFn(
                Box::new(ParseError::UnexpectedEof),
                ($input.len(), $input.len()).offset($start_loc))),
        }
    }
}
138
139 \f
140 // Top Level Parsers ///////////////////////////////////////////////////////////
141
142 pub fn parse_one(input: &str) -> Result<(Sexp, &str), ParseError> {
143 match parse_sexp(input, 0) {
144 Done(rest, result) => Ok((result, rest)),
145 Error(err) => Err(err),
146 }
147 }
148
149 pub fn parse(mut input: &str) -> (Vec<Sexp>, Option<ParseError>) {
150 let mut start_loc = 0;
151 let mut results = Vec::new();
152 loop {
153 match parse_sexp(input, start_loc) {
154 Done(rest, result) => {
155 input = rest;
156 start_loc = result.get_loc().1;
157 results.push(result);
158 if rest.trim() == "" {
159 return (results, None);
160 }
161 }
162 Error(err) => {
163 return (results, Some(err));
164 }
165 }
166 }
167 }
168
169 \f
170 // Core Parsers ////////////////////////////////////////////////////////////////
171
172 pub fn parse_sexp(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
173 let (input, start_loc) = consume_whitespace!(input, start_loc, ParseError::Sexp);
174
175 match input.chars().next() {
176 Some('0'...'9') => parse_number(input, start_loc),
177 Some('(') => parse_list(input, start_loc),
178 Some('#') => parse_character(input, start_loc),
179 Some('"') => parse_string(input, start_loc),
180 Some(_) => parse_symbol(input, start_loc),
181 None => unreachable!(),
182 }
183 }
184
185 pub fn parse_list(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
186 let (input, start_loc) = consume_whitespace!(input, start_loc, ParseError::List);
187
188 match input.chars().nth(0) {
189 Some('(') => (),
190 Some(c) =>
191 return Error(ParseError::List(
192 Box::new(ParseError::Unexpected(c, 0)),
193 (0, 0).offset(start_loc))),
194 None => unreachable!(),
195 }
196
197 let mut input = &input[1..];
198 let mut loc = start_loc + 1;
199 let mut members = Vec::new();
200 println!("!{}", loc);
201 loop {
202 {
203 let (new_input, new_loc) = consume_whitespace!(input, loc, ParseError::List);
204 input = new_input;
205 loc = new_loc;
206 println!("{}", loc);
207 }
208
209 match input.chars().nth(0) {
210 Some(')') =>
211 return Done(&input[1..],
212 Sexp::List(members, (start_loc, loc+1))),
213 Some(_) => (),
214 None => unreachable!(),
215 }
216
217 match parse_sexp(input, loc) {
218 Done(new_input, member) => {
219 loc = member.get_loc().1;
220 members.push(member);
221 input = new_input;
222 }
223 Error(err) =>
224 return Error(ParseError::List(
225 Box::new(err),
226 (0, 0).offset(loc)))
227 }
228 }
229 }
230
231 pub fn parse_number(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
232 let (input, start_loc) = consume_whitespace!(input, start_loc, ParseError::Number);
233
234 match input.chars().next() {
235 Some(c) if !c.is_digit(10) => {
236 return Error(ParseError::Number(
237 Box::new(ParseError::Unexpected(c, start_loc)),
238 (0, c.len_utf8()).offset(start_loc)));
239 }
240 None => return Error(ParseError::Number(
241 Box::new(ParseError::UnexpectedEof),
242 (0, 0).offset(start_loc))),
243 _ => (),
244 }
245
246 let base = 10;
247
248 let mut end = 0;
249 // Before the decimal point
250 for (i, c) in input.char_indices() {
251 if c == '.' {
252 end = i + 1;
253 break;
254 }
255
256 if c.is_delimiter() {
257 return Done(&input[i..],
258 Sexp::Int(input[..i].parse().expect("Already matched digits"),
259 (0, i).offset(start_loc)));
260 }
261
262 if !c.is_digit(base) {
263 return Error(ParseError::Number(
264 Box::new(ParseError::Unexpected(c, start_loc + i)),
265 (i, i).offset(start_loc)));
266 }
267
268 end = i + c.len_utf8();
269 }
270
271 if input[end..].is_empty() {
272 return Done(&input[end..],
273 Sexp::Int(input.parse().expect("Already matched digits"),
274 (0, end).offset(start_loc)));
275 }
276
277 // After the decimal point
278 for (i, c) in input[end..].char_indices() {
279 if c.is_delimiter() {
280 return Done(&input[i+end..],
281 Sexp::Float(input[..end+i].parse().expect("Already matched digits.digits"),
282 (0, end+i).offset(start_loc)));
283 }
284
285 if !c.is_digit(base) {
286 return Error(ParseError::Number(
287 Box::new(ParseError::Unexpected(c, start_loc + i + end)),
288 (i+end, i+end).offset(start_loc)));
289 }
290 }
291
292 Done(&input[input.len()..],
293 Sexp::Float(input.parse().expect("Already matched digits.digits"),
294 (0, input.len()).offset(start_loc)))
295 }
296
297 pub fn parse_symbol(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
298 let (input, start_loc) = consume_whitespace!(input, start_loc, ParseError::Symbol);
299
300 match input.chars().next() {
301 Some(c@'#') | Some(c@':') | Some(c@'0'...'9') =>
302 return Error(ParseError::Symbol(
303 Box::new(ParseError::Unexpected(c, start_loc)),
304 (0, 0).offset(start_loc))),
305 Some(c) if c.is_delimiter() =>
306 return Error(ParseError::Symbol(
307 Box::new(ParseError::Unexpected(c, start_loc)),
308 (0, 0).offset(start_loc))),
309 Some(_) => (),
310 None => unreachable!(),
311 }
312
313 for (i, c) in input.char_indices() {
314 if c.is_delimiter() {
315 return Done(&input[i..],
316 Sexp::Sym(input[..i].into(), (0, i).offset(start_loc)));
317 }
318 }
319
320 Done(&input[input.len()..],
321 Sexp::Sym(input.into(), (0, input.len()).offset(start_loc)))
322 }
323
324 pub fn parse_string(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
325 let (input, start_loc) = consume_whitespace!(input, start_loc, ParseError::String);
326
327 match input.chars().next() {
328 Some('"') => (),
329 Some(c) =>
330 return Error(ParseError::String(
331 Box::new(ParseError::Unexpected(c, start_loc)),
332 (0, 0).offset(start_loc))),
333 None => unreachable!(),
334 }
335
336 for (i, c) in input[1..].char_indices() {
337 if c == '"' {
338 return Done(&input[2+i..],
339 Sexp::Str(input[1..i+1].into(), (0, i+2).offset(start_loc)));
340 }
341 }
342
343 Error(ParseError::String(
344 Box::new(ParseError::UnexpectedEof),
345 (0, input.len()).offset(start_loc)))
346 }
347
348 pub fn parse_character(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
349 let (input, start_loc) = consume_whitespace!(input, start_loc, ParseError::Char);
350
351 match input.chars().nth(0) {
352 Some('#') => (),
353 Some(c) =>
354 return Error(ParseError::Char(
355 Box::new(ParseError::Unexpected(c, start_loc)),
356 (0, 0).offset(start_loc))),
357 None =>
358 return Error(ParseError::Char(
359 Box::new(ParseError::UnexpectedEof),
360 (0, 0).offset(start_loc))),
361 }
362
363 match input.chars().nth(1) {
364 Some('\\') => (),
365 Some(c) =>
366 return Error(ParseError::Char(
367 Box::new(ParseError::Unexpected(c, start_loc + 1)),
368 (1, 1).offset(start_loc))),
369 None =>
370 return Error(ParseError::Char(
371 Box::new(ParseError::UnexpectedEof),
372 (1, 1).offset(start_loc)))
373 }
374
375 match input.chars().nth(2) {
376 Some(c) =>
377 Done(&input[3..], Sexp::Char(c, (0, 3).offset(start_loc))),
378 None =>
379 Error(ParseError::Char(
380 Box::new(ParseError::UnexpectedEof),
381 (2, 2).offset(start_loc)))
382 }
383 }
384
385 \f
386 // Tests ///////////////////////////////////////////////////////////////////////
387
// Unit tests for the top-level and core parsers. Inputs with leading or
// repeated whitespace are written so that the byte offsets in the expected
// spans match the literal exactly.
#[cfg(test)]
mod test {
    use super::*;
    use super::ParseResult::*;

    #[test]
    fn test_parse() {
        assert_eq!(parse("1 2 3"), (vec![
            Sexp::Int(1, (0, 1)), Sexp::Int(2, (2, 3)), Sexp::Int(3, (4, 5))
        ], None));
        assert_eq!(parse("1 2 )"), (vec![
            Sexp::Int(1, (0, 1)), Sexp::Int(2, (2, 3))
        ], Some(ParseError::Symbol(Box::new(ParseError::Unexpected(')', 4)), (4, 4)))));
    }

    #[test]
    fn test_parse_one() {
        assert_eq!(parse_one("1 2"), Ok((Sexp::Int(1, (0, 1)), " 2")));
    }

    #[test]
    fn test_parse_sexp() {
        assert_eq!(parse_sexp(" 1", 0), Done("", Sexp::Int(1, (1, 2))));
        assert_eq!(parse_sexp("2.2", 0), Done("", Sexp::Float(2.2, (0, 3))));
        assert_eq!(parse_sexp(" a", 0), Done("", Sexp::Sym("a".into(), (1, 2))));
        assert_eq!(parse_sexp("#\\c", 0), Done("", Sexp::Char('c', (0, 3))));
        assert_eq!(parse_sexp(r#""hi""#, 0), Done("", Sexp::Str("hi".into(), (0, 4))));
        assert_eq!(parse_sexp("()", 0), Done("", Sexp::List(vec![], (0, 2))));
        assert_eq!(parse_sexp("( 1 2 3 )", 0), Done("", Sexp::List(vec![
            Sexp::Int(1, (2, 3)),
            Sexp::Int(2, (4, 5)),
            Sexp::Int(3, (6, 7)),
        ], (0, 9))));

        assert_eq!(parse_sexp("", 0), Error(ParseError::Sexp(Box::new(ParseError::UnexpectedEof), (0, 0))));
    }

    #[test]
    fn test_parse_list() {
        assert_eq!(parse_list("()", 0), Done("", Sexp::List(vec![], (0, 2))));
        assert_eq!(parse_list("(1)", 0), Done("", Sexp::List(vec![Sexp::Int(1, (1, 2))], (0, 3))));
        // Irregular spacing on purpose: `(` at 2, `1` at 4, `2` at 9, `3` at
        // 12, `a` at 14, `)` at 16.
        assert_eq!(parse_list("  ( 1    2  3 a )", 0), Done("", Sexp::List(vec![
            Sexp::Int(1, (4, 5)),
            Sexp::Int(2, (9, 10)),
            Sexp::Int(3, (12, 13)),
            Sexp::Sym("a".into(), (14, 15)),
        ], (2, 17))));
    }

    #[test]
    fn test_parse_number() {
        assert_eq!(parse_number("1", 0), Done("", Sexp::Int(1, (0, 1))));
        assert_eq!(parse_number(" 13", 0), Done("", Sexp::Int(13, (1, 3))));
        assert_eq!(parse_number("1.2", 0), Done("", Sexp::Float(1.2, (0, 3))));
        assert_eq!(parse_number("\u{3000}4.2", 0), Done("", Sexp::Float(4.2, (0, 3).offset('\u{3000}'.len_utf8()))));
        assert_eq!(parse_number("  42 ", 0), Done(" ", Sexp::Int(42, (2, 4))));
        assert_eq!(parse_number(" 4.2 ", 0), Done(" ", Sexp::Float(4.2, (1, 4))));
        assert_eq!(parse_number("1()", 0), Done("()", Sexp::Int(1, (0, 1))));
        assert_eq!(parse_number("3.6()", 0), Done("()", Sexp::Float(3.6, (0, 3))));

        assert_eq!(parse_number("", 0), Error(ParseError::Number(Box::new(ParseError::UnexpectedEof), (0, 0))));
        assert_eq!(parse_number("123a", 0), Error(ParseError::Number(Box::new(ParseError::Unexpected('a', 3)), (3, 3))));
        assert_eq!(parse_number("66.6+", 0), Error(ParseError::Number(Box::new(ParseError::Unexpected('+', 4)), (4, 4))));
    }

    #[test]
    fn test_parse_ident() {
        assert_eq!(parse_symbol("+", 0), Done("", Sexp::Sym("+".into(), (0, 1))));
        assert_eq!(parse_symbol(" nil?", 0), Done("", Sexp::Sym("nil?".into(), (1, 5))));
        assert_eq!(parse_symbol(" ->socket", 0), Done("", Sexp::Sym("->socket".into(), (1, 9))));
        assert_eq!(parse_symbol("fib(", 0), Done("(", Sexp::Sym("fib".into(), (0, 3))));
        assert_eq!(parse_symbol("foo2", 0), Done("", Sexp::Sym("foo2".into(), (0, 4))));

        // We reserve #foo for the implementation to do as it wishes
        assert_eq!(parse_symbol("#hi", 0), Error(ParseError::Symbol(Box::new(ParseError::Unexpected('#', 0)), (0, 0))));
        // We reserve :foo for keywords
        assert_eq!(parse_symbol(":hi", 0), Error(ParseError::Symbol(Box::new(ParseError::Unexpected(':', 0)), (0, 0))));

        assert_eq!(parse_symbol("", 0), Error(ParseError::Symbol(Box::new(ParseError::UnexpectedEof), (0, 0))));
        assert_eq!(parse_symbol("0", 0), Error(ParseError::Symbol(Box::new(ParseError::Unexpected('0', 0)), (0, 0))));
        assert_eq!(parse_symbol("()", 0), Error(ParseError::Symbol(Box::new(ParseError::Unexpected('(', 0)), (0, 0))));
    }

    #[test]
    fn test_parse_string() {
        assert_eq!(parse_string(r#""""#, 0), Done("", Sexp::Str("".into(), (0, 2))));
        assert_eq!(parse_string(r#""hello""#, 0), Done("", Sexp::Str("hello".into(), (0, 7))));
        // Two leading spaces, then a 44-byte string containing a newline.
        assert_eq!(parse_string("  \"this is a nice string\nwith 0123 things in it\"", 0),
                   Done("", Sexp::Str("this is a nice string\nwith 0123 things in it".into(), (2, 48))));

        assert_eq!(parse_string("", 0), Error(ParseError::String(Box::new(ParseError::UnexpectedEof), (0, 0))));
        assert_eq!(parse_string(r#""hi"#, 0), Error(ParseError::String(Box::new(ParseError::UnexpectedEof), (0, 3))));
    }

    #[test]
    fn test_parse_char() {
        assert_eq!(parse_character(r#"#\""#, 0), Done("", Sexp::Char('"', (0, 3))));
        assert_eq!(parse_character(r#"#\ "#, 0), Done("", Sexp::Char(' ', (0, 3))));
        assert_eq!(parse_character(r#"  #\\"#, 0), Done("", Sexp::Char('\\', (2, 5))));

        assert_eq!(parse_character("", 0), Error(ParseError::Char(Box::new(ParseError::UnexpectedEof), (0, 0))));
        assert_eq!(parse_character("#", 0), Error(ParseError::Char(Box::new(ParseError::UnexpectedEof), (1, 1))));
        assert_eq!(parse_character("#\\", 0), Error(ParseError::Char(Box::new(ParseError::UnexpectedEof), (2, 2))));
        assert_eq!(parse_character("a", 0), Error(ParseError::Char(Box::new(ParseError::Unexpected('a', 0)), (0, 0))));
    }
}