svg/parser/
reader.rs

1use std::iter::Peekable;
2use std::str::Chars;
3
/// A cursor over a string slice that tracks its position and offers
/// `consume_*` primitives for recognizing pieces of XML and SVG syntax.
///
/// Every `consume_*` method advances past the input it recognizes and returns
/// whether it succeeded. Input that was consumed before a failure was detected
/// stays consumed: the reader only ever moves forward.
pub struct Reader<'l> {
    /// One-based line of the next character.
    line: usize,
    /// One-based column of the next character, counted in characters.
    column: usize,
    /// Byte offset of the next character within `content`.
    offset: usize,
    /// The complete input.
    content: &'l str,
    /// The characters of `content` that have not been consumed yet.
    cursor: Peekable<Chars<'l>>,
}

impl<'l> Reader<'l> {
    /// Creates a reader positioned at the start of `content` (line 1, column 1).
    #[inline]
    pub fn new(content: &'l str) -> Self {
        Reader {
            line: 1,
            column: 1,
            offset: 0,
            content,
            cursor: content.chars().peekable(),
        }
    }

    /// Runs `block` and returns the input it consumed, with surrounding
    /// whitespace trimmed.
    ///
    /// Returns `None` when `block` reports failure or when the trimmed slice
    /// is empty. The reader is not rewound in either case.
    pub fn capture<F>(&mut self, block: F) -> Option<&'l str>
    where
        F: Fn(&mut Reader<'l>) -> bool,
    {
        let start = self.offset;
        if !block(self) {
            return None;
        }
        let content = self.content[start..self.offset].trim();
        if content.is_empty() {
            None
        } else {
            Some(content)
        }
    }

    /// Consumes everything that is left; returns `false` if nothing was left.
    #[inline]
    pub fn consume_all(&mut self) -> bool {
        self.consume_while(|_| true)
    }

    /// Consumes a run of characters that all occur in `targets`; returns
    /// whether at least one character was consumed.
    #[inline]
    pub fn consume_any(&mut self, targets: &str) -> bool {
        self.consume_while(|c| targets.contains(c))
    }

    /// Consumes an attribute: a name, an equality sign, and a quoted value.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-Attribute>.
    pub fn consume_attribute(&mut self) -> bool {
        self.consume_name() && self.consume_equality() && self.consume_attribute_value()
    }

    /// Consumes a single- or double-quoted attribute value.
    ///
    /// The value may contain references but neither `<` nor a bare `&`.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-AttValue>.
    pub fn consume_attribute_value(&mut self) -> bool {
        // The opening quote fixes both the closing quote and the characters
        // that interrupt a run of plain value text.
        let (quote, stops) = if self.consume_char('\'') {
            ('\'', "<&'")
        } else if self.consume_char('"') {
            ('"', "<&\"")
        } else {
            return false;
        };
        loop {
            // The result is ignored on purpose: the plain run may be empty.
            self.consume_until_any(stops);
            if self.peek() != Some('&') {
                break;
            }
            if !self.consume_reference() {
                return false;
            }
        }
        // Anything other than the closing quote here (`<` or end of input)
        // makes the value malformed.
        self.consume_char(quote)
    }

    /// Consumes the next character if it equals `target`.
    pub fn consume_char(&mut self, target: char) -> bool {
        self.consume_if(|c| c == target)
    }

    /// Consumes the next character if it is a legal XML character.
    pub fn consume_character(&mut self) -> bool {
        self.consume_if(Reader::check_character)
    }

    /// Consumes a complete comment, `<!--` through `-->`.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#sec-comments>.
    pub fn consume_comment(&mut self) -> bool {
        self.consume_char('<')
            && self.consume_char('!')
            && self.consume_char('-')
            && self.consume_char('-')
            && {
                // An empty body (`<!---->`) is fine, so the result is ignored.
                self.consume_comment_body();
                true
            }
            && self.consume_char('-')
            && self.consume_char('-')
            && self.consume_char('>')
    }

    /// Consumes the text of a comment, stopping in front of the first `--`.
    ///
    /// Scanning also stops at a `-` that is followed by an illegal character
    /// or by the end of the input, at any illegal character, and at the end
    /// of the input. Returns whether at least one character was consumed.
    pub fn consume_comment_body(&mut self) -> bool {
        let mut consumed = false;
        while let Some(c) = self.peek() {
            if c == '-' {
                // A dash is part of the body only when the character after it
                // is a legal character other than another dash.
                match self.peek_many().nth(1) {
                    Some(following) if following != '-' && Reader::check_character(following) => {
                        self.next();
                        consumed = true;
                    }
                    _ => break,
                }
            } else if self.consume_character() {
                consumed = true;
            } else {
                break;
            }
        }
        consumed
    }

    /// Consumes a `<!…>` declaration.
    ///
    /// The scan ends at the first `>`, and at least one character is required
    /// between `<!` and that `>`.
    pub fn consume_declaration(&mut self) -> bool {
        self.consume_char('<')
            && self.consume_char('!')
            && self.consume_until_char('>')
            && self.consume_char('>')
    }

    /// Consumes a run of ASCII decimal digits; returns whether there was at
    /// least one.
    #[inline]
    pub fn consume_digits(&mut self) -> bool {
        self.consume_while(|c| c.is_ascii_digit())
    }

    /// Consumes a run of ASCII hexadecimal digits (`0-9`, `a-f`, `A-F`);
    /// returns whether there was at least one.
    #[inline]
    pub fn consume_digits_hex(&mut self) -> bool {
        self.consume_while(|c| c.is_ascii_hexdigit())
    }

    /// Consumes `=` together with any whitespace around it.
    ///
    /// Whitespace is consumed even when the `=` itself is missing.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-Eq>.
    pub fn consume_equality(&mut self) -> bool {
        self.consume_whitespace();
        let consumed = self.consume_char('=');
        self.consume_whitespace();
        consumed
    }

    /// Consumes the next character if `check` accepts it.
    pub fn consume_if<F>(&mut self, check: F) -> bool
    where
        F: Fn(char) -> bool,
    {
        match self.peek() {
            Some(c) if check(c) => {
                self.next();
                true
            }
            _ => false,
        }
    }

    /// Consumes a `<?…>` processing instruction.
    ///
    /// The scan ends at the first `>`, and at least one character is required
    /// between `<?` and that `>`.
    pub fn consume_instruction(&mut self) -> bool {
        self.consume_char('<')
            && self.consume_char('?')
            && self.consume_until_char('>')
            && self.consume_char('>')
    }

    /// Consumes a name: one name-start character followed by any number of
    /// name characters.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-Name>.
    pub fn consume_name(&mut self) -> bool {
        if !self.consume_name_start_character() {
            return false;
        }
        while self.consume_name_character() {}
        true
    }

    /// Consumes the next character if it may appear inside a name.
    pub fn consume_name_character(&mut self) -> bool {
        self.consume_if(Reader::check_name_character)
    }

    /// Consumes the next character if it may start a name.
    pub fn consume_name_start_character(&mut self) -> bool {
        self.consume_if(Reader::check_name_start_character)
    }

    /// Consumes a number: optional sign, digits with an optional fraction (or
    /// a fraction alone), and an optional exponent.
    ///
    /// A decimal point must be followed by digits, so `1.` is rejected. An
    /// `e` or `E` directly after the mantissa is always taken as the start of
    /// an exponent, so it must be followed by digits as well.
    ///
    /// See <https://www.w3.org/TR/SVG/types.html#DataTypeNumber>.
    pub fn consume_number(&mut self) -> bool {
        self.consume_sign();
        if self.consume_digits() {
            // Integer part present: the fraction is optional, but a decimal
            // point without digits is an error.
            if self.consume_char('.') && !self.consume_digits() {
                return false;
            }
        } else if !self.consume_char('.') || !self.consume_digits() {
            // No integer part: a fraction is mandatory.
            return false;
        }
        if !(self.consume_char('e') || self.consume_char('E')) {
            return true;
        }
        self.consume_sign();
        self.consume_digits()
    }

    /// Consumes a character reference (`&#42;`, `&#x2A;`) or an entity
    /// reference (`&name;`).
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-Reference>.
    pub fn consume_reference(&mut self) -> bool {
        self.consume_char('&')
            && if self.consume_char('#') {
                if self.consume_char('x') {
                    self.consume_digits_hex()
                } else {
                    self.consume_digits()
                }
            } else {
                self.consume_name()
            }
            && self.consume_char(';')
    }

    /// Consumes a single `+` or `-`.
    pub fn consume_sign(&mut self) -> bool {
        self.consume_char('+') || self.consume_char('-')
    }

    /// Consumes a `<…>` tag.
    ///
    /// The scan ends at the first `>`, and at least one character is required
    /// between `<` and that `>`.
    pub fn consume_tag(&mut self) -> bool {
        self.consume_char('<') && self.consume_until_char('>') && self.consume_char('>')
    }

    /// Consumes characters up to, but not including, the first one that
    /// occurs in `targets`; returns whether at least one was consumed.
    #[inline]
    pub fn consume_until_any(&mut self, targets: &str) -> bool {
        self.consume_while(|c| !targets.contains(c))
    }

    /// Consumes characters up to, but not including, the first `target`;
    /// returns whether at least one was consumed.
    #[inline]
    pub fn consume_until_char(&mut self, target: char) -> bool {
        self.consume_while(|c| c != target)
    }

    /// Consumes characters for as long as `check` accepts them; returns
    /// whether at least one was consumed.
    pub fn consume_while<F>(&mut self, check: F) -> bool
    where
        F: Fn(char) -> bool,
    {
        let mut consumed = false;
        while self.consume_if(&check) {
            consumed = true;
        }
        consumed
    }

    /// Consumes a run of spaces, tabs, carriage returns, and line feeds;
    /// returns whether at least one was consumed.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-S>.
    #[inline]
    pub fn consume_whitespace(&mut self) -> bool {
        self.consume_any("\x20\x09\x0D\x0A")
    }

    /// Tells whether the whole input has been consumed.
    #[inline]
    pub fn is_done(&self) -> bool {
        self.offset == self.content.len()
    }

    /// Returns the next character without consuming it.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek().copied()
    }

    /// Returns an iterator over all unconsumed characters, starting with the
    /// one `peek` would return. Iterating it does not advance the reader.
    #[inline]
    pub fn peek_many(&self) -> Chars<'l> {
        self.content[self.offset..].chars()
    }

    /// Returns the one-based `(line, column)` of the next character.
    #[inline]
    pub fn position(&self) -> (usize, usize) {
        (self.line, self.column)
    }

    /// Tells whether `target` is a legal XML character.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-Char>.
    fn check_character(target: char) -> bool {
        matches!(
            target,
            '\u{9}'
            | '\u{A}'
            | '\u{D}'
            | '\u{20}'..='\u{D7FF}'
            | '\u{E000}'..='\u{FFFD}'
            | '\u{10000}'..='\u{10FFFF}',
        )
    }

    /// Tells whether `target` may appear inside a name.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-NameChar>.
    fn check_name_character(target: char) -> bool {
        Reader::check_name_start_character(target)
            || matches!(
                target,
                '-'
                | '.'
                | '0'..='9'
                | '\u{B7}'
                | '\u{0300}'..='\u{036F}'
                | '\u{203F}'..='\u{2040}',
            )
    }

    /// Tells whether `target` may start a name.
    ///
    /// See <https://www.w3.org/TR/REC-xml/#NT-NameStartChar>.
    fn check_name_start_character(target: char) -> bool {
        matches!(
            target,
            ':'
            | 'A'..='Z'
            | '_'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}',
        )
    }
}

/// Iterating a reader consumes its input one character at a time while
/// keeping the line, column, and byte offset up to date.
impl<'l> Iterator for Reader<'l> {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        let c = self.cursor.next()?;
        if c == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
        // The offset is in bytes so that it can be used to slice `content`.
        self.offset += c.len_utf8();
        Some(c)
    }
}
360
#[cfg(test)]
mod tests {
    use super::Reader;

    // Expects `$method`, run through `Reader::capture` on `$content`, to
    // succeed and to yield exactly `$value`.
    macro_rules! captures(
        ($method:ident, $content:expr, $value:expr) => ({
            let mut reader = Reader::new($content);
            let value = reader.capture(|reader| reader.$method());
            assert_eq!(value.unwrap(), $value);
        });
    );

    // Expects `$method` to succeed on `$content`.
    macro_rules! accepts(
        ($method:ident, $content:expr) => ({
            let mut reader = Reader::new($content);
            assert!(reader.$method());
        });
    );

    // Expects `$method` to fail on `$content`.
    macro_rules! rejects(
        ($method:ident, $content:expr) => ({
            let mut reader = Reader::new($content);
            assert!(!reader.$method());
        });
    );

    #[test]
    fn capture() {
        let mut reader = Reader::new("abcdefg");

        // Skip the prefix so that the capture starts in the middle.
        assert!(reader.consume_any("ab"));

        let content = reader.capture(|reader| reader.consume_any("cde"));

        assert_eq!(content.unwrap(), "cde");
    }

    #[test]
    fn consume_attribute() {
        accepts!(consume_attribute, "foo='bar'");
        accepts!(consume_attribute, "foo = \t 'bar'");
        accepts!(consume_attribute, "foo= \"bar\"");
        accepts!(consume_attribute, "標籤='數值'");
        accepts!(consume_attribute, "foo='&bar;'");
        accepts!(consume_attribute, "foo='bar &buz;'");
        accepts!(consume_attribute, "foo='bar &buz; qux'");

        rejects!(consume_attribute, "foo");
        rejects!(consume_attribute, "foo bar");
        rejects!(consume_attribute, "foo=bar");
        rejects!(consume_attribute, "foo='bar");
        rejects!(consume_attribute, "foo=\"bar");
        rejects!(consume_attribute, "foo='&bar'");
        rejects!(consume_attribute, "foo='bar &bar'");
        rejects!(consume_attribute, "foo='bar &bar qux'");
    }

    #[test]
    fn consume_comment() {
        captures!(consume_comment, "<!-- foo --> bar", "<!-- foo -->");
        captures!(consume_comment, "<!-- foo > --> bar", "<!-- foo > -->");

        // https://www.w3.org/TR/REC-xml/#sec-comments
        rejects!(consume_comment, "<!-- B+, B, or B--->");
    }

    #[test]
    fn consume_name() {
        captures!(consume_name, "foo", "foo");
        captures!(consume_name, "foo bar", "foo");
        captures!(consume_name, "foo42 bar", "foo42");
        captures!(consume_name, "foo-bar baz", "foo-bar");
        captures!(consume_name, "foo/", "foo");

        rejects!(consume_name, " foo");
        rejects!(consume_name, "!foo");
        rejects!(consume_name, "<foo");
        rejects!(consume_name, "?foo");
    }

    #[test]
    fn consume_number() {
        captures!(consume_number, "1 ", "1");
        captures!(consume_number, "1a", "1");

        captures!(consume_number, "1", "1");
        captures!(consume_number, "-1", "-1");
        captures!(consume_number, "+1", "+1");

        captures!(consume_number, ".1", ".1");
        captures!(consume_number, "-.1", "-.1");
        captures!(consume_number, "+.1", "+.1");

        captures!(consume_number, "1.2", "1.2");
        captures!(consume_number, "-1.2", "-1.2");
        captures!(consume_number, "+1.2", "+1.2");

        captures!(consume_number, "1E2", "1E2");
        captures!(consume_number, "-1e2", "-1e2");
        captures!(consume_number, "+1e2", "+1e2");

        captures!(consume_number, "1.2e3", "1.2e3");
        captures!(consume_number, "-1.2E3", "-1.2E3");
        captures!(consume_number, "+1.2e3", "+1.2e3");

        captures!(consume_number, "1.2e-3", "1.2e-3");
        captures!(consume_number, "-1.2e-3", "-1.2e-3");
        captures!(consume_number, "+1.2E-3", "+1.2E-3");

        captures!(consume_number, "1.2E+3", "1.2E+3");
        captures!(consume_number, "-1.2e+3", "-1.2e+3");
        captures!(consume_number, "+1.2e+3", "+1.2e+3");

        // A decimal point without digits after it must not be captured.
        macro_rules! captures_nothing(
            ($content:expr) => ({
                let mut reader = Reader::new($content);
                assert!(reader.capture(|reader| reader.consume_number()).is_none());
            });
        );

        captures_nothing!("1.e2");
        captures_nothing!("-1.e2");
        captures_nothing!("+1.e2");
    }

    #[test]
    fn consume_reference() {
        captures!(consume_reference, "&#42; foo", "&#42;");
        captures!(consume_reference, "&#x42aB; foo", "&#x42aB;");
        captures!(consume_reference, "&foo; bar", "&foo;");

        rejects!(consume_reference, " &#42; foo");
        rejects!(consume_reference, "#42; foo");
        rejects!(consume_reference, "&42; foo");
        rejects!(consume_reference, "&#42 foo");
        rejects!(consume_reference, "&#x42z; foo");
        rejects!(consume_reference, "&foo bar");
        rejects!(consume_reference, "foo; bar");
    }

    #[test]
    fn consume_whitespace() {
        let mut reader = Reader::new(" \t  \n\n  \tm ");
        reader.consume_whitespace();

        // Nine whitespace characters precede the `m`, two of them line feeds.
        assert_eq!(reader.line, 3);
        assert_eq!(reader.column, 4);
        assert_eq!(reader.offset, 9);
    }
}