atom/id/
mod.rs

1//! # Atom Identification Constructs
2//!
3//! This module contains the foundational types and logic for working with Atom
4//! identifiers in a cryptographically secure namespace. This enables universal,
5//! collision-resistant addressing of software packages (atoms) across sources,
6//! with end-to-end integrity from origin to content.
7//!
8//! ## High-Level Vision
9//!
10//! The system is designed to create a layered, cryptographic address space for atoms,
11//! allowing unambiguous identification and retrieval across diverse repositories or
12//! sources. At a conceptual level, this involves:
13//! - An immutable **origin** identifier (e.g., a repository's root commit hash) to anchor the
14//!   namespace and ensure domain separation.
15//! - A human-readable **tag** (moniker) within that origin, validated for descriptiveness and
16//!   safety while enabling a vast Unicode-based character set within a single origin.
17//! - A machine-readable **id** combining the origin and tag into a globally unique identifier,
18//!   represented as a cryptographic **hash** derived from the components of the id using BLAKE3
19//!   (with the origin serving as a key in derivation), ensuring atoms with the same tag in
20//!   different origins are cryptographically distinct.
21//!
22//!
//! These primitives, coupled with the rest of an atom's components, enable diverse and efficient
//! tooling capable of unambiguously indexing, querying and addressing software packages with
//! cryptographically sound provenance metadata from origin, to package identifier, to specific
//! versions and their contents (e.g. via git content hashes).
27//!
28//! ## Key Concepts
29//!
30//! **Atom Tags** are Unicode identifiers that descriptively label
31//! atoms within an origin. They are validated to ensure they contain only safe
32//! characters and contribute to a vast address space for cryptographic disambiguation.
33//!
//! **Atom Ids** are the Rust struct coupling a tag to its origin, ultimately represented by the
//! BLAKE3-derived hash of these components, providing a cryptographically secure,
//! collision-resistant, and stable identifier for the atom itself. This ensures disambiguation
//! across origins without tying directly to version-specific content (which may be handled in
//! higher layers).
38//!
39//! ## Tag Validation Rules
40//!
41//! Atom Tags are validated on construction to ensure they serve as descriptive identifiers while
42//! providing a vast character set per origin, suitable for use as the human-readable component of
43//! an atom's cryptographic identity. This allows for meaningful Unicode characters across languages
44//! (beyond just ASCII/English) without permitting nonsensical or overly permissive
45//! content. Validation leverages Unicode general categories for letters and numbers.
46//!
47//! Atom Tags must:
48//! - Be valid UTF-8 encoded Unicode strings
49//! - Not exceed 128 bytes in length (measured in UTF-8 bytes)
50//! - Not be empty
51//! - Start with a Unicode letter (general categories: UppercaseLetter [Lu], LowercaseLetter [Ll],
52//!   TitlecaseLetter [Lt], ModifierLetter [Lm], or OtherLetter [Lo]; not a number, underscore, or
53//!   hyphen)
54//! - Contain only Unicode letters (as defined above), Unicode numbers (DecimalNumber [Nd] or
55//!   LetterNumber [Nl]), hyphens (`-`), and underscores (`_`)
56//!
57//! ## Usage Example
58//!
59//! ```rust,no_run
60//! use atom::store::git::Root;
61//! use atom::{AtomId, AtomTag, Compute, Origin};
62//!
63//! // Create a validated atom tag
64//! let tag = AtomTag::try_from("my-atom").unwrap();
65//!
66//! // Create an AtomId with a Git origin
67//! let repo = gix::open(".").unwrap();
68//! let commit = repo
69//!     .rev_parse_single("HEAD")
70//!     .map(|s| repo.find_commit(s))
71//!     .unwrap()
72//!     .unwrap();
73//!
74//! let id = AtomId::construct(&commit, tag).unwrap();
75//!
76//! // Get the hash for disambiguated identification
77//! let hash = id.compute_hash();
78//! println!("Atom hash: {}", hash);
79//! ```
80#[cfg(test)]
81mod tests;
82
83use std::borrow::Borrow;
84use std::ffi::OsStr;
85use std::fmt;
86use std::ops::Deref;
87use std::str::FromStr;
88
89use serde::{Deserialize, Serialize, Serializer};
90use thiserror::Error;
91use unic_ucd_category::GeneralCategory;
92
/// Maximum permitted length of an atom tag, measured in UTF-8 bytes.
const ID_MAX: usize = 128;
94
95#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
96#[serde(try_from = "String")]
97/// A vetted String suitable for an atom's `tag` field
98pub struct AtomTag(String);
99
100#[derive(Error, Debug, PartialEq, Eq)]
101/// Errors that can occur during atom tag validation.
102///
103/// These errors represent various validation failures when creating or parsing
104/// atom identifiers, ensuring they conform to the required format and constraints
105/// for cryptographically secure identification.
106pub enum Error {
107    /// The atom identifier exceeds the maximum allowed length of 128 bytes.
108    ///
109    /// Atom tags are limited in size to ensure efficient storage and processing
110    /// while maintaining a sufficient character space for meaningful identifiers.
111    #[error("An Atom id cannot be more than {} bytes", ID_MAX)]
112    TooLong,
113
114    /// The atom identifier is empty.
115    ///
116    /// Empty identifiers are not allowed as they provide no meaningful
117    /// identification for the atom.
118    #[error("An Atom id cannot be empty")]
119    Empty,
120
121    /// The atom identifier starts with an invalid character.
122    ///
123    /// Atom identifiers must start with a Unicode letter (not numbers, hyphens,
124    /// underscores, or other non-letter characters) to ensure they are
125    /// descriptive and follow identifier conventions.
126    #[error("An Atom id cannot start with: '{0}'")]
127    InvalidStart(char),
128
129    /// The atom identifier contains invalid characters.
130    ///
131    /// Only Unicode letters, numbers, hyphens, and underscores are permitted
132    /// in atom identifiers. This ensures compatibility across different systems
133    /// and maintains readability.
134    #[error("The Atom id contains invalid characters: '{0}'")]
135    InvalidCharacters(String),
136
137    /// The atom identifier contains invalid Unicode.
138    ///
139    /// Atom identifiers must be valid UTF-8 encoded Unicode strings.
140    /// This error occurs when the string contains invalid byte sequences
141    /// or encoding issues.
142    #[error("An Atom id must be valid unicode")]
143    InvalidUnicode,
144}
145
146/// Trait for computing BLAKE3 hashes of AtomIds.
147///
148/// This trait is implemented for AtomId to provide a way to compute
149/// cryptographically secure hashes that can be used as unique identifiers
150/// for atoms in storage backends.
151pub trait Compute<'id, T>: Borrow<[u8]> {
152    /// Computes the BLAKE3 hash of this AtomId.
153    ///
154    /// The hash is computed using a key derived from the atom's root value,
155    /// ensuring that atoms with the same ID but different roots produce
156    /// different hashes.
157    ///
158    /// # Returns
159    ///
160    /// An `IdHash` containing the 32-byte BLAKE3 hash and a reference
161    /// to the original AtomId.
162    fn compute_hash(&'id self) -> IdHash<'id, T>;
163}
164
/// Describes a source from which the origin of an [`AtomId`] can be calculated.
///
/// [`AtomId::construct`] calls [`Origin::calculate_origin`] on its `src` argument and
/// stores the returned value as the id's origin.
pub trait Origin<R> {
    /// Error produced when [`Origin::calculate_origin`] fails.
    type Error;

    /// Calculates the origin value to store in the [`AtomId`].
    ///
    /// # Errors
    ///
    /// Returns `Self::Error` if the calculation fails or is impossible.
    fn calculate_origin(&self) -> Result<R, Self::Error>;
}
178
179/// The type representing all the components necessary to serve as
180/// an unambiguous identifier. Atoms consist of a human-readable
181/// Unicode identifier, as well as a root field, which varies for
182/// each store implementation. For example, Git uses the oldest
183/// commit in a repositories history.
184#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
185pub struct AtomId<R> {
186    origin: R,
187    tag: AtomTag,
188}
189
190#[derive(Debug, Clone, PartialEq, Eq, Hash)]
191/// Represents the BLAKE3 hash of an AtomId.
192///
193/// This struct contains a 32-byte BLAKE3 hash that serves as a
194/// cryptographically secure, globally unique identifier for an atom.
195/// The hash is computed from the combination of the atom's human-readable
196/// ID and its context-specific origin value.
197pub struct IdHash<'id, T> {
198    /// The 32-byte BLAKE3 hash value
199    hash: [u8; 32],
200    /// Reference to the AtomId that was hashed
201    id: &'id AtomId<T>,
202}
203
204impl<R> Serialize for AtomId<R> {
205    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
206    where
207        S: Serializer,
208    {
209        // Serialize only the `tag` field as a string
210        self.tag.serialize(serializer)
211    }
212}
213
214impl<T> Deref for IdHash<'_, T> {
215    type Target = [u8; 32];
216
217    fn deref(&self) -> &Self::Target {
218        &self.hash
219    }
220}
221
222impl<'id, R: AsRef<[u8]>> Compute<'id, R> for AtomId<R> {
223    fn compute_hash(&'id self) -> IdHash<'id, R> {
224        use blake3::Hasher;
225
226        let key = blake3::derive_key("AtomId", self.origin.as_ref());
227
228        let mut hasher = Hasher::new_keyed(&key);
229        hasher.update(self.tag.as_bytes());
230        IdHash {
231            hash: *hasher.finalize().as_bytes(),
232            id: self,
233        }
234    }
235}
236
237impl<T> Borrow<[u8]> for AtomId<T> {
238    fn borrow(&self) -> &[u8] {
239        self.tag.as_bytes()
240    }
241}
242
243impl<R> AtomId<R>
244where
245    for<'id> AtomId<R>: Compute<'id, R>,
246{
247    /// Compute an atom's origin and construct its ID. This method takes a `src`
248    /// type which must implement the [`Origin`] struct.
249    ///
250    /// # Errors
251    ///
252    /// This function will return an error if the call to
253    /// [`Origin::calculate_origin`] fails.
254    pub fn construct<T>(src: &T, tag: AtomTag) -> Result<Self, T::Error>
255    where
256        T: Origin<R>,
257    {
258        let origin = src.calculate_origin()?;
259        Ok(AtomId { origin, tag })
260    }
261
262    /// The root field, which serves as a derived key for the blake-3 hash used to
263    /// identify the Atom in backend implementations.
264    pub fn root(&self) -> &R {
265        &self.origin
266    }
267}
268
/// Reserved tag text used for the repository root. It begins with an underscore, so
/// it would be rejected by `AtomTag::validate` and cannot clash with a validated tag.
pub(crate) const ROOT_TAG: &str = "__ROOT";

/// For contexts where you want an identifier that is validated similarly to an AtomTag.
pub type Name = AtomTag;
273
274impl AtomTag {
275    fn validate_start(c: char) -> Result<(), Error> {
276        if AtomTag::is_invalid_start(c) {
277            return Err(Error::InvalidStart(c));
278        }
279        Ok(())
280    }
281
282    /// special purpose, internal only function to return what would normally be an invalid tag
283    /// specifying the repo root commit. Allowing the root commit to be, e.g. packaged in iterators
284    /// containing AtomTags
285    pub(crate) fn root_tag() -> AtomTag {
286        AtomTag(ROOT_TAG.into())
287    }
288
289    pub(crate) fn is_root(&self) -> bool {
290        self == &AtomTag::root_tag()
291    }
292
293    pub(super) fn validate(s: &str) -> Result<(), Error> {
294        if s.len() > ID_MAX {
295            return Err(Error::TooLong);
296        }
297
298        match s.chars().next().map(AtomTag::validate_start) {
299            Some(Ok(())) => (),
300            Some(Err(e)) => return Err(e),
301            None => return Err(Error::Empty),
302        }
303
304        let invalid_chars: String = s.chars().filter(|&c| !AtomTag::is_valid_char(c)).collect();
305
306        if !invalid_chars.is_empty() {
307            return Err(Error::InvalidCharacters(invalid_chars));
308        }
309
310        Ok(())
311    }
312
313    pub(super) fn is_invalid_start(c: char) -> bool {
314        matches!(
315            GeneralCategory::of(c),
316            GeneralCategory::DecimalNumber | GeneralCategory::LetterNumber
317        ) || c == '_'
318            || c == '-'
319            || !AtomTag::is_valid_char(c)
320    }
321
322    pub(super) fn is_valid_char(c: char) -> bool {
323        matches!(
324            GeneralCategory::of(c),
325            GeneralCategory::LowercaseLetter
326                | GeneralCategory::UppercaseLetter
327                | GeneralCategory::TitlecaseLetter
328                | GeneralCategory::ModifierLetter
329                | GeneralCategory::OtherLetter
330                | GeneralCategory::DecimalNumber
331                | GeneralCategory::LetterNumber
332        ) || c == '-'
333            || c == '_'
334    }
335}
336
337impl Deref for AtomTag {
338    type Target = String;
339
340    fn deref(&self) -> &Self::Target {
341        &self.0
342    }
343}
344
345impl fmt::Display for AtomTag {
346    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
347        write!(f, "{}", self.0)
348    }
349}
350impl FromStr for AtomTag {
351    type Err = Error;
352
353    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
354        AtomTag::validate(s)?;
355        Ok(AtomTag(s.to_string()))
356    }
357}
358
359impl TryFrom<String> for AtomTag {
360    type Error = Error;
361
362    fn try_from(s: String) -> Result<Self, Self::Error> {
363        AtomTag::validate(&s)?;
364        Ok(AtomTag(s))
365    }
366}
367
368impl TryFrom<&OsStr> for AtomTag {
369    type Error = Error;
370
371    fn try_from(s: &OsStr) -> Result<Self, Self::Error> {
372        let s = s.to_str().ok_or(Error::InvalidUnicode)?;
373        AtomTag::from_str(s)
374    }
375}
376
377impl TryFrom<&str> for AtomTag {
378    type Error = Error;
379
380    fn try_from(s: &str) -> Result<Self, Self::Error> {
381        AtomTag::from_str(s)
382    }
383}
384
385use std::fmt::Display;
386
387impl<R> AtomId<R> {
388    /// Return a reference to the Atom's Unicode identifier.
389    pub fn tag(&self) -> &AtomTag {
390        &self.tag
391    }
392}
393
394impl<R> Display for AtomId<R>
395where
396    for<'id> AtomId<R>: Compute<'id, R>,
397{
398    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
399        let s = self.compute_hash();
400        if let Some(max_width) = f.precision() {
401            write!(f, "{s:.max_width$}")
402        } else {
403            write!(f, "{s}")
404        }
405    }
406}
407
408impl<'a, R> Display for IdHash<'a, R> {
409    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
410        let s = base32::encode(crate::BASE32, &self.hash);
411        if let Some(max_width) = f.precision() {
412            write!(f, "{s:.max_width$}")
413        } else {
414            f.write_str(&s)
415        }
416    }
417}