1use std::iter::Peekable;
2use std::str::Chars;
3
/// A cursor over a string slice that tracks its position while consuming
/// characters.
///
/// `Clone` allows a caller to snapshot the reader before a speculative parse;
/// `Debug` makes the reader printable in diagnostics.
#[derive(Clone, Debug)]
pub struct Reader<'l> {
    /// Current line number; starts at 1 and is incremented on every `'\n'`.
    line: usize,
    /// Current column number; starts at 1 and is reset to 1 after every `'\n'`.
    column: usize,
    /// Byte offset into `content` of the next character to be read.
    offset: usize,
    /// The complete text being read.
    content: &'l str,
    /// Character iterator over `content` with one character of look-ahead.
    cursor: Peekable<Chars<'l>>,
}
11
12impl<'l> Reader<'l> {
13 #[inline]
14 pub fn new(content: &'l str) -> Self {
15 Reader {
16 line: 1,
17 column: 1,
18 offset: 0,
19 content,
20 cursor: content.chars().peekable(),
21 }
22 }
23
24 pub fn capture<F>(&mut self, block: F) -> Option<&'l str>
25 where
26 F: Fn(&mut Reader<'l>) -> bool,
27 {
28 let start = self.offset;
29 if !block(self) {
30 return None;
31 }
32 let content = &self.content[start..self.offset].trim();
33 if content.is_empty() {
34 None
35 } else {
36 Some(content)
37 }
38 }
39
40 #[inline]
41 pub fn consume_all(&mut self) -> bool {
42 self.consume_while(|_| true)
43 }
44
45 #[inline]
46 pub fn consume_any(&mut self, targets: &str) -> bool {
47 self.consume_while(|c| targets.contains(c))
48 }
49
50 pub fn consume_attribute(&mut self) -> bool {
52 self.consume_name() && self.consume_equality() && self.consume_attribute_value()
53 }
54
55 pub fn consume_attribute_value(&mut self) -> bool {
57 let single;
58 if self.consume_char('\'') {
59 single = true;
60 } else if self.consume_char('"') {
61 single = false;
62 } else {
63 return false;
64 }
65 loop {
66 self.consume_until_any(if single { "<&'" } else { "<&\"" });
67 match self.peek() {
68 Some('&') => {
69 if !self.consume_reference() {
70 return false;
71 }
72 }
73 _ => break,
74 }
75 }
76 self.consume_char(if single { '\'' } else { '"' })
77 }
78
79 pub fn consume_char(&mut self, target: char) -> bool {
80 match self.peek() {
81 Some(c) if c == target => {
82 self.next();
83 true
84 }
85 _ => false,
86 }
87 }
88
89 pub fn consume_character(&mut self) -> bool {
90 self.consume_if(Reader::check_character)
91 }
92
93 pub fn consume_comment(&mut self) -> bool {
95 self.consume_char('<')
96 && self.consume_char('!')
97 && self.consume_char('-')
98 && self.consume_char('-')
99 && {
100 self.consume_comment_body();
101 true
102 }
103 && self.consume_char('-')
104 && self.consume_char('-')
105 && self.consume_char('>')
106 }
107
108 pub fn consume_comment_body(&mut self) -> bool {
109 let mut consumed = true;
110 while let Some(c) = self.peek() {
111 if c == '-' {
112 let mut iterator = self
113 .peek_many()
114 .take(2)
115 .map(|c| c != '-' && Reader::check_character(c));
116 match (iterator.next(), iterator.next()) {
117 (Some(false), Some(false)) => break,
118 (Some(false), Some(true)) => {
119 assert!(self.consume_char('-'));
120 consumed = true;
121 }
122 (Some(false), None) => break,
123 _ => unreachable!(),
124 }
125 } else if self.consume_character() {
126 consumed = true;
127 } else {
128 break;
129 }
130 }
131 consumed
132 }
133
134 pub fn consume_declaration(&mut self) -> bool {
135 self.consume_char('<')
136 && self.consume_char('!')
137 && self.consume_until_char('>')
138 && self.consume_char('>')
139 }
140
141 #[inline]
142 pub fn consume_digits(&mut self) -> bool {
143 self.consume_while(|c| c.is_ascii_digit())
144 }
145
146 #[inline]
147 pub fn consume_digits_hex(&mut self) -> bool {
148 self.consume_while(|c| {
149 c.is_ascii_digit() || ('a'..='f').contains(&c) || ('A'..='F').contains(&c)
150 })
151 }
152
153 pub fn consume_equality(&mut self) -> bool {
155 self.consume_whitespace();
156 let consumed = self.consume_char('=');
157 self.consume_whitespace();
158 consumed
159 }
160
161 pub fn consume_if<F>(&mut self, check: F) -> bool
162 where
163 F: Fn(char) -> bool,
164 {
165 match self.peek() {
166 Some(c) => {
167 if check(c) {
168 self.next();
169 true
170 } else {
171 false
172 }
173 }
174 _ => false,
175 }
176 }
177
178 pub fn consume_instruction(&mut self) -> bool {
179 self.consume_char('<')
180 && self.consume_char('?')
181 && self.consume_until_char('>')
182 && self.consume_char('>')
183 }
184
185 pub fn consume_name(&mut self) -> bool {
187 self.consume_name_start_character() && {
188 while self.consume_name_character() {}
189 true
190 }
191 }
192
193 pub fn consume_name_character(&mut self) -> bool {
194 self.consume_if(Reader::check_name_character)
195 }
196
197 pub fn consume_name_start_character(&mut self) -> bool {
198 self.consume_if(Reader::check_name_start_character)
199 }
200
201 pub fn consume_number(&mut self) -> bool {
203 self.consume_sign();
204 if self.consume_digits() {
205 if self.consume_char('.') && !self.consume_digits() {
206 return false;
207 }
208 } else if !self.consume_char('.') || !self.consume_digits() {
209 return false;
210 }
211 if !self.consume_char('e') && !self.consume_char('E') {
212 return true;
213 }
214 self.consume_sign();
215 self.consume_digits()
216 }
217
218 pub fn consume_reference(&mut self) -> bool {
220 self.consume_char('&')
221 && if self.consume_char('#') {
222 if self.consume_char('x') {
223 self.consume_digits_hex()
224 } else {
225 self.consume_digits()
226 }
227 } else {
228 self.consume_name()
229 }
230 && self.consume_char(';')
231 }
232
233 pub fn consume_sign(&mut self) -> bool {
234 self.consume_char('+') || self.consume_char('-')
235 }
236
237 pub fn consume_tag(&mut self) -> bool {
238 self.consume_char('<') && self.consume_until_char('>') && self.consume_char('>')
239 }
240
241 #[inline]
242 pub fn consume_until_any(&mut self, targets: &str) -> bool {
243 self.consume_while(|c| !targets.contains(c))
244 }
245
246 #[inline]
247 pub fn consume_until_char(&mut self, target: char) -> bool {
248 self.consume_while(|c| c != target)
249 }
250
251 pub fn consume_while<F>(&mut self, check: F) -> bool
252 where
253 F: Fn(char) -> bool,
254 {
255 let mut consumed = false;
256 while self.consume_if(&check) {
257 consumed = true;
258 }
259 consumed
260 }
261
262 #[inline]
264 pub fn consume_whitespace(&mut self) -> bool {
265 self.consume_any("\x20\x09\x0D\x0A")
266 }
267
268 #[inline]
269 pub fn is_done(&self) -> bool {
270 self.offset == self.content.len()
271 }
272
273 #[inline]
274 pub fn peek(&mut self) -> Option<char> {
275 self.cursor.peek().copied()
276 }
277
278 #[inline]
279 pub fn peek_many(&self) -> Chars<'l> {
280 self.content[self.offset..].chars()
281 }
282
283 #[inline]
284 pub fn position(&self) -> (usize, usize) {
285 (self.line, self.column)
286 }
287
288 fn check_character(target: char) -> bool {
290 matches!(
291 target,
292 '\u{9}'
293 | '\u{A}'
294 | '\u{D}'
295 | '\u{20}'..='\u{D7FF}'
296 | '\u{E000}'..='\u{FFFD}'
297 | '\u{10000}'..='\u{10FFFF}',
298 )
299 }
300
301 fn check_name_character(target: char) -> bool {
303 if Reader::check_name_start_character(target) {
304 return true;
305 }
306 matches!(
307 target,
308 '-'
309 | '.'
310 | '0'..='9'
311 | '\u{B7}'
312 | '\u{0300}'..='\u{036F}'
313 | '\u{203F}'..='\u{2040}',
314 )
315 }
316
317 fn check_name_start_character(target: char) -> bool {
319 matches!(
320 target,
321 ':'
322 | 'A'..='Z'
323 | '_'
324 | 'a'..='z'
325 | '\u{C0}'..='\u{D6}'
326 | '\u{D8}'..='\u{F6}'
327 | '\u{F8}'..='\u{2FF}'
328 | '\u{370}'..='\u{37D}'
329 | '\u{37F}'..='\u{1FFF}'
330 | '\u{200C}'..='\u{200D}'
331 | '\u{2070}'..='\u{218F}'
332 | '\u{2C00}'..='\u{2FEF}'
333 | '\u{3001}'..='\u{D7FF}'
334 | '\u{F900}'..='\u{FDCF}'
335 | '\u{FDF0}'..='\u{FFFD}'
336 | '\u{10000}'..='\u{EFFFF}',
337 )
338 }
339}
340
341impl<'l> Iterator for Reader<'l> {
342 type Item = char;
343
344 fn next(&mut self) -> Option<char> {
345 match self.cursor.next() {
346 Some(c) => {
347 if c == '\n' {
348 self.line += 1;
349 self.column = 1;
350 } else {
351 self.column += 1;
352 }
353 self.offset += c.len_utf8();
354 Some(c)
355 }
356 _ => None,
357 }
358 }
359}
360
#[cfg(test)]
mod tests {
    use super::Reader;

    // `capture` returns exactly the text consumed by the closure.
    #[test]
    fn capture() {
        let mut reader = Reader::new("abcdefg");

        assert!(reader.consume_any("ab"));

        let content = reader.capture(|reader| reader.consume_any("cde"));

        assert_eq!(content.unwrap(), "cde");
    }

    #[test]
    fn consume_attribute() {
        // Well-formed attributes.
        macro_rules! test(
            ($content:expr) => ({
                let mut reader = Reader::new($content);
                assert!(reader.consume_attribute());
            });
        );

        test!("foo='bar'");
        test!("foo = \t 'bar'");
        test!("foo= \"bar\"");
        test!("標籤='數值'");
        test!("foo='&bar;'");
        test!("foo='bar &buz;'");
        test!("foo='bar &buz; qux'");

        // Missing value, missing quotes, unterminated values, and
        // unterminated references.
        macro_rules! test(
            ($content:expr) => ({
                let mut reader = Reader::new($content);
                assert!(!reader.consume_attribute());
            });
        );

        test!("foo");
        test!("foo bar");
        test!("foo=bar");
        test!("foo='bar");
        test!("foo=\"bar");
        test!("foo='&bar'");
        test!("foo='bar &bar'");
        test!("foo='bar &bar qux'");
    }

    #[test]
    fn consume_comment() {
        macro_rules! test(
            ($content:expr, $value:expr) => ({
                let mut reader = Reader::new($content);
                let value = reader.capture(|reader| reader.consume_comment());
                assert_eq!(value.unwrap(), $value);
            });
        );

        test!("<!-- foo --> bar", "<!-- foo -->");
        test!("<!-- foo > --> bar", "<!-- foo > -->");

        macro_rules! test(
            ($content:expr) => ({
                let mut reader = Reader::new($content);
                assert!(!reader.consume_comment());
            });
        );

        // The body must not end with a dash.
        test!("<!-- B+, B, or B--->");
    }

    #[test]
    fn consume_name() {
        macro_rules! test(
            ($content:expr, $value:expr) => ({
                let mut reader = Reader::new($content);
                let value = reader.capture(|reader| reader.consume_name());
                assert_eq!(value.unwrap(), $value);
            });
        );

        test!("foo", "foo");
        test!("foo bar", "foo");
        test!("foo42 bar", "foo42");
        test!("foo-bar baz", "foo-bar");
        test!("foo/", "foo");

        macro_rules! test(
            ($content:expr) => ({
                let mut reader = Reader::new($content);
                assert!(!reader.consume_name());
            });
        );

        test!(" foo");
        test!("!foo");
        test!("<foo");
        test!("?foo");
    }

    #[test]
    fn consume_number() {
        macro_rules! test(
            ($content:expr, $value:expr) => ({
                let mut reader = Reader::new($content);
                let value = reader.capture(|reader| reader.consume_number());
                assert_eq!(value.unwrap(), $value);
            });
        );

        test!("1 ", "1");
        test!("1a", "1");

        test!("1", "1");
        test!("-1", "-1");
        test!("+1", "+1");

        test!(".1", ".1");
        test!("-.1", "-.1");
        test!("+.1", "+.1");

        test!("1.2", "1.2");
        test!("-1.2", "-1.2");
        test!("+1.2", "+1.2");

        test!("1E2", "1E2");
        test!("-1e2", "-1e2");
        test!("+1e2", "+1e2");

        test!("1.2e3", "1.2e3");
        test!("-1.2E3", "-1.2E3");
        test!("+1.2e3", "+1.2e3");

        test!("1.2e-3", "1.2e-3");
        test!("-1.2e-3", "-1.2e-3");
        test!("+1.2E-3", "+1.2E-3");

        test!("1.2E+3", "1.2E+3");
        test!("-1.2e+3", "-1.2e+3");
        test!("+1.2e+3", "+1.2e+3");

        // A decimal point must be followed by at least one digit.
        macro_rules! test(
            ($content:expr) => ({
                let mut reader = Reader::new($content);
                assert!(reader.capture(|reader| reader.consume_number()).is_none());
            });
        );

        test!("1.e2");
        test!("-1.e2");
        test!("+1.e2");
    }

    #[test]
    fn consume_reference() {
        // Decimal, hexadecimal, and named references. The inputs must contain
        // the literal reference syntax, not the characters they stand for.
        macro_rules! test(
            ($content:expr, $value:expr) => ({
                let mut reader = Reader::new($content);
                let value = reader.capture(|reader| reader.consume_reference());
                assert_eq!(value.unwrap(), $value);
            });
        );

        test!("&#42; foo", "&#42;");
        test!("&#x42AB; foo", "&#x42AB;");
        test!("&foo; bar", "&foo;");

        // Leading whitespace, missing `&`, missing `#`, missing `;`, and a
        // non-hexadecimal digit.
        macro_rules! test(
            ($content:expr) => ({
                let mut reader = Reader::new($content);
                assert!(!reader.consume_reference());
            });
        );

        test!(" &#42; foo");
        test!("#42; foo");
        test!("&42; foo");
        test!("&#42 foo");
        test!("&#x42z; foo");
        test!("&foo bar");
        test!("foo; bar");
    }

    #[test]
    fn consume_whitespace() {
        // Spaces are written as `\x20` so the exact count is unambiguous:
        // nine whitespace characters precede `m`, two of them line feeds, and
        // three follow the last line feed.
        let mut reader = Reader::new("\x20\t\x20\x20\n\n\x20\x20\tm\x20");
        reader.consume_whitespace();

        assert_eq!(reader.line, 3);
        assert_eq!(reader.column, 4);
        assert_eq!(reader.offset, 9);
    }
}