stem_rs/descriptor/
tordnsel.rs

//! TorDNSEL exit list parsing.
//!
//! This module parses exit list files from [TorDNSEL](https://www.torproject.org/projects/tordnsel.html.en)
//! (Tor DNS-based Exit List). These files contain information about Tor exit
//! nodes and the IP addresses they use when exiting to the internet.
//!
//! # Overview
//!
//! TorDNSEL is a service that tracks which IP addresses are used by Tor exit
//! nodes. This information is useful for:
//!
//! - Identifying traffic originating from Tor exit nodes
//! - Implementing access controls based on Tor usage
//! - Research and analysis of the Tor network
//!
//! Exit lists are published periodically and contain entries for each known
//! exit relay, including:
//! - The relay's fingerprint (identity)
//! - When the relay was last seen in the consensus
//! - The IP addresses the relay uses for exiting
//!
//! # File Format
//!
//! Exit list files follow this format:
//!
//! ```text
//! @type tordnsel 1.0
//! Downloaded 2024-01-01 00:00:00
//! ExitNode <40 hex fingerprint>
//! Published <YYYY-MM-DD HH:MM:SS>
//! LastStatus <YYYY-MM-DD HH:MM:SS>
//! ExitAddress <IPv4 address> <YYYY-MM-DD HH:MM:SS>
//! ExitAddress <IPv4 address> <YYYY-MM-DD HH:MM:SS>
//! ExitNode <40 hex fingerprint>
//! ...
//! ```
//!
//! # Example
//!
//! ```rust
//! use stem_rs::descriptor::tordnsel::{TorDNSEL, parse_exit_list};
//!
//! let exit_list = r#"ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
//! Published 2024-01-01 12:00:00
//! LastStatus 2024-01-01 13:00:00
//! ExitAddress 192.168.1.1 2024-01-01 13:30:00
//! ExitNode 00FF300624FECA7F40515C8D854EE925332580D6
//! Published 2024-01-01 11:00:00
//! LastStatus 2024-01-01 12:00:00
//! ExitAddress 10.0.0.1 2024-01-01 12:30:00
//! "#;
//!
//! let entries = parse_exit_list(exit_list)?;
//! assert_eq!(entries.len(), 2);
//!
//! for entry in &entries {
//!     println!("Exit node: {}", entry.fingerprint);
//!     for (addr, date) in &entry.exit_addresses {
//!         println!("  Exit address: {} (seen {})", addr, date);
//!     }
//! }
//! # Ok::<(), stem_rs::Error>(())
//! ```
//!
//! # Data Source
//!
//! Exit lists can be obtained from:
//! - [Tor Metrics](https://metrics.torproject.org/collector.html) - Historical data
//! - [CollecTor](https://collector.torproject.org/) - Archive of Tor network data
//!
//! # See Also
//!
//! - [`server`](super::server): Server descriptors with full relay information
//! - [`consensus`](super::consensus): Network status documents
//! - [`remote`](super::remote): Downloading descriptors from the network
76
77use crate::Error;
78use chrono::{DateTime, NaiveDateTime, Utc};
79use std::net::Ipv4Addr;
80
81/// A TorDNSEL exit list entry for a single relay.
82///
83/// Each entry represents one Tor exit relay and contains information about
84/// when it was last seen and what IP addresses it uses for exiting traffic.
85///
86/// # Structure
87///
88/// A TorDNSEL entry contains:
89/// - The relay's fingerprint (40-character hex string)
90/// - Publication and last-seen timestamps
91/// - One or more exit addresses with observation times
92///
93/// # Exit Addresses
94///
95/// A relay may have multiple exit addresses because:
96/// - It may use different addresses for different exit ports
97/// - It may have changed addresses over time
98/// - It may be multi-homed (multiple network interfaces)
99///
100/// Each exit address is paired with the time it was observed being used.
101///
102/// # Example
103///
104/// ```rust
105/// use stem_rs::descriptor::tordnsel::TorDNSEL;
106///
107/// let content = r#"ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
108/// Published 2024-01-01 12:00:00
109/// LastStatus 2024-01-01 13:00:00
110/// ExitAddress 192.168.1.1 2024-01-01 13:30:00
111/// "#;
112///
113/// let entry = TorDNSEL::parse(content)?;
114/// assert_eq!(entry.fingerprint, "003A71137D959748C8157C4A76ECA639CEF5E33E");
115/// assert_eq!(entry.exit_addresses.len(), 1);
116/// # Ok::<(), stem_rs::Error>(())
117/// ```
118#[derive(Debug, Clone)]
119pub struct TorDNSEL {
120    /// SHA-1 fingerprint of the relay's identity key.
121    ///
122    /// This is a 40-character hexadecimal string that uniquely identifies
123    /// the relay. It matches the fingerprint in server descriptors and
124    /// consensus documents.
125    pub fingerprint: String,
126
127    /// Time when the relay published its descriptor.
128    ///
129    /// This indicates when the relay last updated its server descriptor.
130    /// May be `None` if the field was missing or unparseable.
131    pub published: Option<DateTime<Utc>>,
132
133    /// Time when the relay was last seen in a network status.
134    ///
135    /// This indicates when the relay was last included in a consensus
136    /// document. A relay not seen recently may no longer be active.
137    /// May be `None` if the field was missing or unparseable.
138    pub last_status: Option<DateTime<Utc>>,
139
140    /// List of exit addresses observed for this relay.
141    ///
142    /// Each entry is a tuple of (IPv4 address, observation time).
143    /// The observation time indicates when TorDNSEL detected that
144    /// the relay was using this address for exit traffic.
145    ///
146    /// A relay may have multiple exit addresses if it uses different
147    /// addresses for different connections or has changed addresses.
148    pub exit_addresses: Vec<(Ipv4Addr, DateTime<Utc>)>,
149
150    /// Raw bytes of the original entry content.
151    raw_content: Vec<u8>,
152
153    /// Lines that were not recognized during parsing.
154    unrecognized_lines: Vec<String>,
155}
156
157impl TorDNSEL {
158    /// Parses a TorDNSEL entry from a string.
159    ///
160    /// This method parses a single exit list entry containing information
161    /// about one relay.
162    ///
163    /// # Arguments
164    ///
165    /// * `content` - The entry content as a string
166    ///
167    /// # Returns
168    ///
169    /// A parsed `TorDNSEL` entry on success.
170    ///
171    /// # Errors
172    ///
173    /// Returns [`Error::Parse`] if:
174    /// - The `ExitNode` line is missing
175    /// - The fingerprint is not a valid 40-character hex string
176    ///
177    /// Note: Invalid timestamps are silently ignored rather than causing errors.
178    ///
179    /// # Example
180    ///
181    /// ```rust
182    /// use stem_rs::descriptor::tordnsel::TorDNSEL;
183    ///
184    /// let content = r#"ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
185    /// Published 2024-01-01 12:00:00
186    /// ExitAddress 192.168.1.1 2024-01-01 13:30:00
187    /// "#;
188    ///
189    /// let entry = TorDNSEL::parse(content)?;
190    /// assert_eq!(entry.fingerprint, "003A71137D959748C8157C4A76ECA639CEF5E33E");
191    /// # Ok::<(), stem_rs::Error>(())
192    /// ```
193    pub fn parse(content: &str) -> Result<Self, Error> {
194        Self::parse_bytes(content.as_bytes())
195    }
196
197    /// Parses a TorDNSEL entry from raw bytes.
198    ///
199    /// This is the byte-oriented version of [`parse()`](Self::parse),
200    /// useful when reading directly from files or network streams.
201    ///
202    /// # Arguments
203    ///
204    /// * `content` - The entry content as bytes (UTF-8 encoded)
205    ///
206    /// # Returns
207    ///
208    /// A parsed `TorDNSEL` entry on success.
209    ///
210    /// # Errors
211    ///
212    /// Returns [`Error::Parse`] if:
213    /// - The `ExitNode` line is missing
214    /// - The fingerprint is not a valid 40-character hex string
215    ///
216    /// # Note
217    ///
218    /// Invalid UTF-8 sequences are replaced with the Unicode replacement
219    /// character (U+FFFD) rather than causing an error.
220    pub fn parse_bytes(content: &[u8]) -> Result<Self, Error> {
221        let content_str = String::from_utf8_lossy(content);
222        let mut fingerprint = None;
223        let mut published = None;
224        let mut last_status = None;
225        let mut exit_addresses = Vec::new();
226        let mut unrecognized_lines = Vec::new();
227
228        for line in content_str.lines() {
229            let line = line.trim();
230            if line.is_empty() || line.starts_with('@') {
231                continue;
232            }
233
234            if let Some(value) = line.strip_prefix("ExitNode ") {
235                let fp = value.trim();
236                if !is_valid_fingerprint(fp) {
237                    return Err(Error::Parse {
238                        location: "tordnsel".into(),
239                        reason: format!(
240                            "Tor relay fingerprints consist of forty hex digits: {}",
241                            fp
242                        ),
243                    });
244                }
245                fingerprint = Some(fp.to_string());
246            } else if let Some(value) = line.strip_prefix("Published ") {
247                published = parse_timestamp(value.trim());
248            } else if let Some(value) = line.strip_prefix("LastStatus ") {
249                last_status = parse_timestamp(value.trim());
250            } else if let Some(value) = line.strip_prefix("ExitAddress ") {
251                if let Some((addr_str, date_str)) = value.split_once(' ') {
252                    if let Ok(addr) = addr_str.trim().parse::<Ipv4Addr>() {
253                        if let Some(date) = parse_timestamp(date_str.trim()) {
254                            exit_addresses.push((addr, date));
255                        }
256                    }
257                }
258            } else if !line.starts_with("Downloaded ") {
259                unrecognized_lines.push(line.to_string());
260            }
261        }
262
263        let fingerprint = fingerprint.ok_or_else(|| Error::Parse {
264            location: "tordnsel".into(),
265            reason: "Missing ExitNode fingerprint".into(),
266        })?;
267
268        Ok(Self {
269            fingerprint,
270            published,
271            last_status,
272            exit_addresses,
273            raw_content: content.to_vec(),
274            unrecognized_lines,
275        })
276    }
277
278    /// Returns the raw bytes of the original entry content.
279    ///
280    /// This provides access to the exact bytes that were parsed,
281    /// useful for debugging or storing entries in their original format.
282    ///
283    /// # Returns
284    ///
285    /// A byte slice containing the original entry content.
286    pub fn raw_content(&self) -> &[u8] {
287        &self.raw_content
288    }
289
290    /// Returns lines that were not recognized during parsing.
291    ///
292    /// Unrecognized lines are preserved for forward compatibility
293    /// with future exit list format extensions.
294    ///
295    /// # Returns
296    ///
297    /// A slice of strings, each representing an unrecognized line.
298    /// Empty if all lines were recognized.
299    pub fn unrecognized_lines(&self) -> &[String] {
300        &self.unrecognized_lines
301    }
302
303    /// Converts the entry back to its string representation.
304    ///
305    /// This produces a string in the standard TorDNSEL format
306    /// that can be parsed again or written to a file.
307    ///
308    /// # Returns
309    ///
310    /// A string containing the entry in standard format.
311    ///
312    /// # Example
313    ///
314    /// ```rust
315    /// use stem_rs::descriptor::tordnsel::TorDNSEL;
316    ///
317    /// let content = r#"ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
318    /// Published 2024-01-01 12:00:00
319    /// ExitAddress 192.168.1.1 2024-01-01 13:30:00
320    /// "#;
321    ///
322    /// let entry = TorDNSEL::parse(content)?;
323    /// let output = entry.to_descriptor_string();
324    /// assert!(output.contains("ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E"));
325    /// # Ok::<(), stem_rs::Error>(())
326    /// ```
327    pub fn to_descriptor_string(&self) -> String {
328        let mut lines = Vec::new();
329        lines.push(format!("ExitNode {}", self.fingerprint));
330        if let Some(dt) = self.published {
331            lines.push(format!("Published {}", dt.format("%Y-%m-%d %H:%M:%S")));
332        }
333        if let Some(dt) = self.last_status {
334            lines.push(format!("LastStatus {}", dt.format("%Y-%m-%d %H:%M:%S")));
335        }
336        for (addr, date) in &self.exit_addresses {
337            lines.push(format!(
338                "ExitAddress {} {}",
339                addr,
340                date.format("%Y-%m-%d %H:%M:%S")
341            ));
342        }
343        lines.join("\n")
344    }
345}
346
347/// Parses a complete TorDNSEL exit list file.
348///
349/// This function parses a file containing multiple exit list entries,
350/// returning a vector of all entries found.
351///
352/// # Arguments
353///
354/// * `content` - The complete exit list file content as a string
355///
356/// # Returns
357///
358/// A vector of parsed [`TorDNSEL`] entries on success.
359///
360/// # Errors
361///
362/// Returns [`Error::Parse`] if any entry in the file is malformed.
363/// Parsing stops at the first error.
364///
365/// # Example
366///
367/// ```rust
368/// use stem_rs::descriptor::tordnsel::parse_exit_list;
369///
370/// let exit_list = r#"@type tordnsel 1.0
371/// Downloaded 2024-01-01 00:00:00
372/// ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
373/// Published 2024-01-01 12:00:00
374/// ExitAddress 192.168.1.1 2024-01-01 13:30:00
375/// ExitNode 00FF300624FECA7F40515C8D854EE925332580D6
376/// Published 2024-01-01 11:00:00
377/// ExitAddress 10.0.0.1 2024-01-01 12:30:00
378/// "#;
379///
380/// let entries = parse_exit_list(exit_list)?;
381/// assert_eq!(entries.len(), 2);
382/// # Ok::<(), stem_rs::Error>(())
383/// ```
384pub fn parse_exit_list(content: &str) -> Result<Vec<TorDNSEL>, Error> {
385    parse_exit_list_bytes(content.as_bytes())
386}
387
388/// Parses a complete TorDNSEL exit list file from raw bytes.
389///
390/// This is the byte-oriented version of [`parse_exit_list()`],
391/// useful when reading directly from files or network streams.
392///
393/// # Arguments
394///
395/// * `content` - The complete exit list file content as bytes (UTF-8 encoded)
396///
397/// # Returns
398///
399/// A vector of parsed [`TorDNSEL`] entries on success.
400///
401/// # Errors
402///
403/// Returns [`Error::Parse`] if any entry in the file is malformed.
404/// Parsing stops at the first error.
405///
406/// # Note
407///
408/// Invalid UTF-8 sequences are replaced with the Unicode replacement
409/// character (U+FFFD) rather than causing an error.
410pub fn parse_exit_list_bytes(content: &[u8]) -> Result<Vec<TorDNSEL>, Error> {
411    let content_str = String::from_utf8_lossy(content);
412    let mut entries = Vec::new();
413    let mut current_entry = Vec::new();
414    let mut in_entry = false;
415
416    for line in content_str.lines() {
417        let trimmed = line.trim();
418        if trimmed.starts_with("ExitNode ") {
419            if in_entry && !current_entry.is_empty() {
420                let entry_content = current_entry.join("\n");
421                entries.push(TorDNSEL::parse(&entry_content)?);
422                current_entry.clear();
423            }
424            in_entry = true;
425        }
426        if in_entry {
427            current_entry.push(line.to_string());
428        }
429    }
430
431    if !current_entry.is_empty() {
432        let entry_content = current_entry.join("\n");
433        entries.push(TorDNSEL::parse(&entry_content)?);
434    }
435
436    Ok(entries)
437}
438
/// Checks whether `fp` has the shape of a relay fingerprint.
///
/// Returns `true` only when the string is exactly 40 bytes long and every
/// one of them is an ASCII hexadecimal digit (upper or lower case).
fn is_valid_fingerprint(fp: &str) -> bool {
    const FINGERPRINT_LEN: usize = 40;

    if fp.len() != FINGERPRINT_LEN {
        return false;
    }
    // Any non-ASCII character contributes bytes >= 0x80, which are rejected
    // here just as the character itself would be.
    fp.bytes().all(|b| b.is_ascii_hexdigit())
}
445
446/// Parses a timestamp string in Tor's standard format.
447///
448/// Expected format: "YYYY-MM-DD HH:MM:SS"
449fn parse_timestamp(s: &str) -> Option<DateTime<Utc>> {
450    NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S")
451        .ok()
452        .map(|dt| dt.and_utc())
453}
454
#[cfg(test)]
mod tests {
    use super::*;

    /// Fingerprint of the first relay in the fixtures.
    const FIRST_FP: &str = "003A71137D959748C8157C4A76ECA639CEF5E33E";
    /// Fingerprint of the second relay in [`TEST_DESC`].
    const SECOND_FP: &str = "00FF300624FECA7F40515C8D854EE925332580D6";
    /// Timestamp layout used throughout the exit list format.
    const TS_FORMAT: &str = "%Y-%m-%d %H:%M:%S";

    /// A three-relay exit list including the file-level header lines.
    const TEST_DESC: &str = r#"@type tordnsel 1.0
Downloaded 2013-08-19 04:02:03
ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
Published 2013-08-19 02:13:53
LastStatus 2013-08-19 03:02:47
ExitAddress 66.223.170.168 2013-08-19 03:18:51
ExitNode 00FF300624FECA7F40515C8D854EE925332580D6
Published 2013-08-18 07:02:14
LastStatus 2013-08-18 09:02:58
ExitAddress 82.252.181.153 2013-08-18 08:03:01
ExitAddress 82.252.181.154 2013-08-18 08:03:02
ExitAddress 82.252.181.155 2013-08-18 08:03:03
ExitNode 030B22437D99B2DB2908B747B6962EAD13AB4039
Published 2013-08-18 12:44:20
LastStatus 2013-08-18 13:02:57
ExitAddress 46.10.211.205 2013-08-18 13:18:48
"#;

    /// A complete single-relay entry with one exit address.
    const SINGLE_ENTRY: &str = r#"ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
Published 2013-08-19 02:13:53
LastStatus 2013-08-19 03:02:47
ExitAddress 66.223.170.168 2013-08-19 03:18:51"#;

    /// Parses `content` as one entry, panicking if the parser rejects it.
    fn parse_ok(content: &str) -> TorDNSEL {
        TorDNSEL::parse(content).unwrap()
    }

    #[test]
    fn test_parse_exit_list() {
        let parsed = parse_exit_list(TEST_DESC).unwrap();
        assert_eq!(parsed.len(), 3);

        let second = &parsed[1];
        assert_eq!(second.fingerprint, SECOND_FP);
        assert!(second.published.is_some());
        assert!(second.last_status.is_some());
        assert_eq!(second.exit_addresses.len(), 3);

        let (first_addr, _seen) = &second.exit_addresses[0];
        assert_eq!(*first_addr, Ipv4Addr::new(82, 252, 181, 153));
    }

    #[test]
    fn test_parse_single_entry() {
        let entry = parse_ok(SINGLE_ENTRY);
        assert_eq!(entry.fingerprint, FIRST_FP);
        assert_eq!(entry.exit_addresses.len(), 1);
    }

    #[test]
    fn test_invalid_fingerprint() {
        // Only 26 hex digits instead of the required 40.
        assert!(TorDNSEL::parse("ExitNode 030B22437D99B2DB2908B747B6").is_err());
    }

    #[test]
    fn test_missing_fingerprint() {
        let content = r#"Published 2013-08-19 02:13:53
ExitAddress 66.223.170.168 2013-08-19 03:18:51"#;
        assert!(TorDNSEL::parse(content).is_err());
    }

    #[test]
    fn test_malformed_date_skipped() {
        let content = r#"ExitNode 030B22437D99B2DB2908B747B6962EAD13AB4038
Published Today!
LastStatus 2013-08-18 13:02:57
ExitAddress 46.10.211.205 Never"#;

        let entry = parse_ok(content);
        assert_eq!(
            entry.fingerprint,
            "030B22437D99B2DB2908B747B6962EAD13AB4038"
        );
        // Bad timestamps are dropped rather than reported as errors.
        assert!(entry.published.is_none());
        assert_eq!(entry.exit_addresses.len(), 0);
    }

    #[test]
    fn test_to_descriptor_string() {
        let rendered = parse_ok(SINGLE_ENTRY).to_descriptor_string();
        assert!(rendered.contains("ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E"));
        assert!(rendered.contains("Published 2013-08-19 02:13:53"));
        assert!(rendered.contains("ExitAddress 66.223.170.168"));
    }

    #[test]
    fn test_parse_file_assertions() {
        let parsed = parse_exit_list(TEST_DESC).unwrap();
        assert_eq!(parsed.len(), 3);

        let second = &parsed[1];
        assert_eq!(second.fingerprint, SECOND_FP);
        assert!(second.published.is_some());
        assert!(second.last_status.is_some());
        assert_eq!(second.exit_addresses.len(), 3);

        let (first_addr, seen) = &second.exit_addresses[0];
        assert_eq!(*first_addr, Ipv4Addr::new(82, 252, 181, 153));
        assert_eq!(seen.format(TS_FORMAT).to_string(), "2013-08-18 08:03:01");
    }

    #[test]
    fn test_multiple_exit_addresses() {
        let content = r#"ExitNode 00FF300624FECA7F40515C8D854EE925332580D6
Published 2013-08-18 07:02:14
LastStatus 2013-08-18 09:02:58
ExitAddress 82.252.181.153 2013-08-18 08:03:01
ExitAddress 82.252.181.154 2013-08-18 08:03:02
ExitAddress 82.252.181.155 2013-08-18 08:03:03"#;

        let entry = parse_ok(content);
        let rendered: Vec<String> = entry
            .exit_addresses
            .iter()
            .map(|(addr, _)| addr.to_string())
            .collect();
        assert_eq!(
            rendered,
            ["82.252.181.153", "82.252.181.154", "82.252.181.155"]
        );
    }

    #[test]
    fn test_unrecognized_lines() {
        let content = r#"ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
Published 2013-08-19 02:13:53
UnknownField some value
ExitAddress 66.223.170.168 2013-08-19 03:18:51"#;

        let entry = parse_ok(content);
        assert_eq!(entry.unrecognized_lines(), &["UnknownField some value"]);
    }

    #[test]
    fn test_type_annotation_ignored() {
        let content = r#"@type tordnsel 1.0
ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
Published 2013-08-19 02:13:53"#;

        assert_eq!(parse_ok(content).fingerprint, FIRST_FP);
    }

    #[test]
    fn test_downloaded_line_ignored() {
        let content = r#"Downloaded 2013-08-19 04:02:03
ExitNode 003A71137D959748C8157C4A76ECA639CEF5E33E
Published 2013-08-19 02:13:53"#;

        let entry = parse_ok(content);
        assert_eq!(entry.fingerprint, FIRST_FP);
        assert!(entry.unrecognized_lines().is_empty());
    }
}