]> Witch of Git - ess/blob - src/lib.rs
Parse strings
[ess] / src / lib.rs
1 //! A lightweight S-expression parser intended for language implementation.
2
3 // #![warn(missing_docs)]
4 #![deny(unsafe_code)]
5
6 use std::borrow::Cow;
7
8 /// A type representing arbitrary symbolic expressions. `Sexp` carries the
9 /// source code location it came from along with it for later diagnostic
10 /// purposes.
11 #[derive(Debug, PartialEq, Clone, PartialOrd)]
12 pub enum Sexp<'a, Loc=ByteSpan> where Loc: Span {
13 /// A value representing a symbol.
14 Sym(Cow<'a, str>, Loc),
15 /// A value representing a string literal.
16 Str(Cow<'a, str>, Loc),
17 /// A value representing a single character.
18 Char(char, Loc),
19 /// A value representing an integer. Any number containing no decimal point
20 /// will be parsed as an `Int`.
21 Int(i64, Loc),
22 /// A value representing a floating point number. Any number containing a
23 /// decimal point will be parsed as a `Float`.
24 Float(f64, Loc),
25 /// A list of subexpressions.
26 List(Vec<Sexp<'a, Loc>>, Loc),
27 }
28
29 impl<'a, Loc> Sexp<'a, Loc> where Loc: Span {
30 pub fn get_loc(&self) -> &Loc {
31 match *self {
32 Sexp::Sym(.., ref l) => l,
33 Sexp::Str(.., ref l) => l,
34 Sexp::Char(.., ref l) => l,
35 Sexp::Int(.., ref l) => l,
36 Sexp::Float(.., ref l) => l,
37 Sexp::List(.., ref l) => l,
38 }
39 }
40
41 pub fn get_loc_mut(&mut self) -> &mut Loc {
42 match *self {
43 Sexp::Sym(.., ref mut l) => l,
44 Sexp::Str(.., ref mut l) => l,
45 Sexp::Char(.., ref mut l) => l,
46 Sexp::Int(.., ref mut l) => l,
47 Sexp::Float(.., ref mut l) => l,
48 Sexp::List(.., ref mut l) => l,
49 }
50 }
51 }
52
53 \f
54 // General Parsing Types ///////////////////////////////////////////////////////
55
/// A source location attached to parsed values and parse errors.
///
/// The `(usize, usize)` implementation in this file shows the intended
/// meaning of each operation.
pub trait Span {
    /// The type of a single position, such as the start of a span.
    type Begin;

    /// Returns the position at which this span starts.
    fn begin(&self) -> Self::Begin;
    /// Returns this span moved by `begin`.
    fn offset(&self, begin: Self::Begin) -> Self;
    /// Returns a span that covers both `self` and `other`.
    fn union(&self, other: &Self) -> Self;
}
63
/// The outcome of running a parser over a string slice.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ParseResult<'a, T, E> {
    /// Parsing succeeded: the input that was not consumed, followed by the
    /// parsed value.
    Done(&'a str, T),
    /// Parsing failed with the given error.
    Error(E),
}
69
70 use ParseResult::*;
71
72 \f
73 // Specific Parsing Types (ParseError, ByteSpan) ///////////////////////////////
74
75 /// Indicates how parsing failed.
76 #[derive(Debug, PartialEq, Eq, Clone)]
77 pub enum ParseError<Loc=ByteSpan> where Loc: Span {
78 /// We can't explain how the parsing failed.
79 UnexpectedEof,
80 String(Box<ParseError>, Loc),
81 Symbol(Box<ParseError>, Loc),
82 Number(Box<ParseError>, Loc),
83 Unexpected(char, Loc::Begin),
84 Unimplemented,
85 }
86
87 type ByteSpan = (usize, usize);
88
89 impl Span for ByteSpan {
90 type Begin = usize;
91
92 fn offset(&self, begin: Self::Begin) -> Self {
93 (self.0 + begin, self.1 + begin)
94 }
95
96 fn begin(&self) -> Self::Begin {
97 self.0
98 }
99
100 fn union(&self, other: &Self) -> Self {
101 use std::cmp::{min, max};
102 (min(self.0, other.0), max(self.1, other.1))
103 }
104 }
105
106
107 \f
108 // Parsing Utilities ///////////////////////////////////////////////////////////
109
/// Classifies characters that terminate a token.
trait IsDelimeter {
    /// Returns `true` when the value ends the token that precedes it.
    fn is_delimiter(&self) -> bool;
}

impl IsDelimeter for char {
    /// Whitespace, `;`, any bracket, either quote, the backtick and the
    /// comma are delimiters.
    fn is_delimiter(&self) -> bool {
        match *self {
            ';' | '(' | ')' | '[' | ']' | '{' | '}' | '"' | '\'' | '`' | ',' => true,
            other => other.is_whitespace(),
        }
    }
}
124
125 \f
126 // Parsers /////////////////////////////////////////////////////////////////////
127
128 // pub fn parse_one(input: &str) -> Result<Sexp, ParseError>;
129
130 // pub fn parse(input: &str) -> Result<Vec<Sexp>, ParseError>;
131
132 pub fn parse_number(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
133 // Consume all the whitespace at the beginning of the string
134 let end_of_white = if let Some(pos) = input.find(|c: char| !c.is_whitespace()) {
135 pos
136 } else {
137 return Error(ParseError::Number(
138 Box::new(ParseError::UnexpectedEof),
139 (input.len(), input.len()).offset(start_loc)));
140 };
141
142 let input = &input[end_of_white..];
143 let start_loc = start_loc + end_of_white;
144
145 match input.chars().next() {
146 Some(c) if !c.is_digit(10) => {
147 return Error(ParseError::Number(
148 Box::new(ParseError::Unexpected(c, start_loc)),
149 (0, c.len_utf8()).offset(start_loc)));
150 }
151 None => return Error(ParseError::Number(
152 Box::new(ParseError::UnexpectedEof),
153 (0, 0).offset(start_loc))),
154 _ => (),
155 }
156
157 let base = 10;
158
159 let mut end = 0;
160 // Before the decimal point
161 for (i, c) in input.char_indices() {
162 if c == '.' {
163 end = i + 1;
164 break;
165 }
166
167 if c.is_delimiter() {
168 return Done(&input[i..],
169 Sexp::Int(input[..i].parse().expect("Already matched digits"),
170 (0, i).offset(start_loc)));
171 }
172
173 if !c.is_digit(base) {
174 return Error(ParseError::Number(
175 Box::new(ParseError::Unexpected(c, start_loc + i)),
176 (i, i).offset(start_loc)));
177 }
178
179 end = i + c.len_utf8();
180 }
181
182 if input[end..].is_empty() {
183 return Done(&input[end..],
184 Sexp::Int(input.parse().expect("Already matched digits"),
185 (0, end).offset(start_loc)));
186 }
187
188 // After the decimal point
189 for (i, c) in input[end..].char_indices() {
190 if c.is_delimiter() {
191 return Done(&input[i+end..],
192 Sexp::Float(input[..end+i].parse().expect("Already matched digits.digits"),
193 (0, end+i).offset(start_loc)));
194 }
195
196 if !c.is_digit(base) {
197 return Error(ParseError::Number(
198 Box::new(ParseError::Unexpected(c, start_loc + i + end)),
199 (i+end, i+end).offset(start_loc)));
200 }
201 }
202
203 Done(&input[input.len()..],
204 Sexp::Float(input.parse().expect("Already matched digits.digits"),
205 (0, input.len()).offset(start_loc)))
206 }
207
208 pub fn parse_symbol(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
209 let end_of_white = if let Some(pos) = input.find(|c: char| !c.is_whitespace()) {
210 pos
211 } else {
212 return Error(ParseError::Symbol(
213 Box::new(ParseError::UnexpectedEof),
214 (input.len(), input.len()).offset(start_loc)));
215 };
216
217 let input = &input[end_of_white..];
218 let start_loc = start_loc + end_of_white;
219
220 match input.chars().next() {
221 Some(c@'#') | Some(c@':') | Some(c@'0'...'9') =>
222 return Error(ParseError::Symbol(
223 Box::new(ParseError::Unexpected(c, start_loc)),
224 (0, 0).offset(start_loc))),
225 Some(c) if c.is_delimiter() =>
226 return Error(ParseError::Symbol(
227 Box::new(ParseError::Unexpected(c, start_loc)),
228 (0, 0).offset(start_loc))),
229 Some(_) => (),
230 None => unreachable!(),
231 }
232
233 for (i, c) in input.char_indices() {
234 if c.is_delimiter() {
235 return Done(&input[i..],
236 Sexp::Sym(input[..i].into(), (0, i).offset(start_loc)));
237 }
238 }
239
240 Done(&input[input.len()..],
241 Sexp::Sym(input.into(), (0, input.len()).offset(start_loc)))
242 }
243
244 pub fn parse_string(input: &str, start_loc: usize) -> ParseResult<Sexp, ParseError> {
245 let end_of_white = if let Some(pos) = input.find(|c: char| !c.is_whitespace()) {
246 pos
247 } else {
248 return Error(ParseError::String(
249 Box::new(ParseError::UnexpectedEof),
250 (input.len(), input.len()).offset(start_loc)));
251 };
252
253 let input = &input[end_of_white..];
254 let start_loc = start_loc + end_of_white;
255
256 match input.chars().next() {
257 Some('"') => (),
258 Some(c) =>
259 return Error(ParseError::String(
260 Box::new(ParseError::Unexpected(c, start_loc)),
261 (0, 0).offset(start_loc))),
262 None => unreachable!(),
263 }
264
265 for (i, c) in input[1..].char_indices() {
266 if c == '"' {
267 return Done(&input[2+i..],
268 Sexp::Str(input[1..i+1].into(), (0, i+2).offset(start_loc)));
269 }
270 }
271
272 Error(ParseError::String(
273 Box::new(ParseError::UnexpectedEof),
274 (0, input.len()).offset(start_loc)))
275 }
276
277 \f
278 // Tests ///////////////////////////////////////////////////////////////////////
279
#[cfg(test)]
mod test {
    use super::*;
    use super::ParseResult::*;

    // Inputs whose leading whitespace matters are written with explicit
    // spaces and escapes so that the expected byte spans stay correct even if
    // the file is reformatted.

    #[test]
    fn test_parse_number() {
        assert_eq!(parse_number("1", 0), Done("", Sexp::Int(1, (0, 1))));
        assert_eq!(parse_number(" 13", 0), Done("", Sexp::Int(13, (1, 3))));
        assert_eq!(parse_number("1.2", 0), Done("", Sexp::Float(1.2, (0, 3))));
        assert_eq!(parse_number("\u{3000}4.2", 0),
                   Done("", Sexp::Float(4.2, (0, 3).offset('\u{3000}'.len_utf8()))));
        // Two leading spaces: the literal occupies bytes 2..4.
        assert_eq!(parse_number("  42 ", 0), Done(" ", Sexp::Int(42, (2, 4))));
        assert_eq!(parse_number(" 4.2 ", 0), Done(" ", Sexp::Float(4.2, (1, 4))));
        assert_eq!(parse_number("1()", 0), Done("()", Sexp::Int(1, (0, 1))));
        assert_eq!(parse_number("3.6()", 0), Done("()", Sexp::Float(3.6, (0, 3))));

        assert_eq!(parse_number("", 0),
                   Error(ParseError::Number(Box::new(ParseError::UnexpectedEof), (0, 0))));
        assert_eq!(parse_number("123a", 0),
                   Error(ParseError::Number(Box::new(ParseError::Unexpected('a', 3)), (3, 3))));
        assert_eq!(parse_number("66.6+", 0),
                   Error(ParseError::Number(Box::new(ParseError::Unexpected('+', 4)), (4, 4))));
    }

    #[test]
    fn test_parse_ident() {
        assert_eq!(parse_symbol("+", 0), Done("", Sexp::Sym("+".into(), (0, 1))));
        assert_eq!(parse_symbol(" nil?", 0), Done("", Sexp::Sym("nil?".into(), (1, 5))));
        assert_eq!(parse_symbol(" ->socket", 0), Done("", Sexp::Sym("->socket".into(), (1, 9))));
        assert_eq!(parse_symbol("fib(", 0), Done("(", Sexp::Sym("fib".into(), (0, 3))));
        assert_eq!(parse_symbol("foo2", 0), Done("", Sexp::Sym("foo2".into(), (0, 4))));

        // We reserve #foo for the implementation to do as it wishes
        assert_eq!(parse_symbol("#hi", 0),
                   Error(ParseError::Symbol(Box::new(ParseError::Unexpected('#', 0)), (0, 0))));
        // We reserve :foo for keywords
        assert_eq!(parse_symbol(":hi", 0),
                   Error(ParseError::Symbol(Box::new(ParseError::Unexpected(':', 0)), (0, 0))));

        assert_eq!(parse_symbol("", 0),
                   Error(ParseError::Symbol(Box::new(ParseError::UnexpectedEof), (0, 0))));
        assert_eq!(parse_symbol("0", 0),
                   Error(ParseError::Symbol(Box::new(ParseError::Unexpected('0', 0)), (0, 0))));
        assert_eq!(parse_symbol("()", 0),
                   Error(ParseError::Symbol(Box::new(ParseError::Unexpected('(', 0)), (0, 0))));
    }

    #[test]
    fn test_parse_string() {
        assert_eq!(parse_string("\"\"", 0), Done("", Sexp::Str("".into(), (0, 2))));
        assert_eq!(parse_string("\"hello\"", 0), Done("", Sexp::Str("hello".into(), (0, 7))));
        // Two leading spaces, then a 44-byte body containing a newline:
        // the quoted literal occupies bytes 2..48.
        assert_eq!(parse_string("  \"this is a nice string\nwith 0123 things in it\"", 0),
                   Done("", Sexp::Str("this is a nice string\nwith 0123 things in it".into(),
                                      (2, 48))));
        assert_eq!(parse_string("\"hi", 0),
                   Error(ParseError::String(Box::new(ParseError::UnexpectedEof), (0, 3))));
    }
}
329
330 // #[cfg(test)]
331 // #[test]
332 // fn test_parse_char() {
333 // assert_eq!(character(r#"#\""#), IResult::Done("", Sexp::Char('"')));
334 // assert_eq!(character(r#"#\ "#), IResult::Done("", Sexp::Char(' ')));
335 // assert_eq!(character(r#" #\\"#), IResult::Done("", Sexp::Char('\\')));
336
337 // assert!(character("#").is_incomplete());
338 // assert!(character("a").is_err());
339 // }
340
341
342 // #[cfg(test)]
343 // #[test]
344 // fn test_parse_list() {
345 // assert_eq!(list("()"), IResult::Done("", vec![]));
346 // assert_eq!(list("(1)"), IResult::Done("", vec![Sexp::Int(1)]));
347 // assert_eq!(list(" ( 1 2 3 a )"), IResult::Done("", vec![
348 // Sexp::Int(1),
349 // Sexp::Int(2),
350 // Sexp::Int(3),
351 // Sexp::Sym("a".into()),
352 // ]));
353 // }
354
355 // #[cfg(test)]
356 // #[test]
357 // fn test_parse_only_one() {
358 // assert!(parse_one("1 2").is_err());
359 // }
360
361 // #[cfg(test)]
362 // #[test]
363 // fn test_parse_expression() {
364 // assert_eq!(parse_one(r#"
365 // (def (main)
366 // (print (str "say " #\" "Hello, World" #\" " today!")))
367 // "#),
368 // Ok(Sexp::List(vec![
369 // Sexp::Sym("def".into()),
370 // Sexp::List(
371 // vec![Sexp::Sym("main".into())]
372 // ),
373 // Sexp::List(vec![
374 // Sexp::Sym("print".into()),
375 // Sexp::List(vec![
376 // Sexp::Sym("str".into()),
377 // Sexp::Str("say ".into()),
378 // Sexp::Char('"'),
379 // Sexp::Str("Hello, World".into()),
380 // Sexp::Char('"'),
381 // Sexp::Str(" today!".into()),
382 // ])
383 // ])
384 // ])));
385 // }
386
387 // #[cfg(test)]
388 // #[test]
389 // fn test_parse_multi() {
390 // assert_eq!(parse(" 1 2 3 "),
391 // Ok(vec![Sexp::Int(1), Sexp::Int(2), Sexp::Int(3)]));
392 // }