atom/id/mod.rs
1//! # Atom Identification Constructs
2//!
3//! This module contains the foundational types and logic for working with Atom
4//! identifiers in a cryptographically secure namespace. This enables universal,
5//! collision-resistant addressing of software packages (atoms) across sources,
6//! with end-to-end integrity from origin to content.
7//!
8//! ## High-Level Vision
9//!
10//! The system is designed to create a layered, cryptographic address space for atoms,
11//! allowing unambiguous identification and retrieval across diverse repositories or
12//! sources. At a conceptual level, this involves:
13//! - An immutable **origin** identifier (e.g., a repository's root commit hash) to anchor the
14//! namespace and ensure domain separation.
15//! - A human-readable **tag** (moniker) within that origin, validated for descriptiveness and
16//! safety while enabling a vast Unicode-based character set within a single origin.
17//! - A machine-readable **id** combining the origin and tag into a globally unique identifier,
18//! represented as a cryptographic **hash** derived from the components of the id using BLAKE3
19//! (with the origin serving as a key in derivation), ensuring atoms with the same tag in
20//! different origins are cryptographically distinct.
21//!
22//!
//! These primitives, coupled with the rest of an atom's components, enable diverse and efficient
//! tooling capable of unambiguously indexing, querying and addressing software packages with
//! cryptographically sound provenance metadata from origin, to package identifier, to specific
//! versions and their contents (e.g. via git content hashes).
27//!
28//! ## Key Concepts
29//!
30//! **Atom Tags** are Unicode identifiers that descriptively label
31//! atoms within an origin. They are validated to ensure they contain only safe
32//! characters and contribute to a vast address space for cryptographic disambiguation.
33//!
//! **Atom Ids** are the Rust struct coupling a tag to its origin, ultimately represented by the
//! BLAKE3-derived hash of these components, providing a cryptographically secure,
//! collision-resistant, and stable identifier for the atom itself. This ensures disambiguation
//! across origins without tying directly to version-specific content (which may be handled in
//! higher layers).
38//!
39//! ## Tag Validation Rules
40//!
41//! Atom Tags are validated on construction to ensure they serve as descriptive identifiers while
42//! providing a vast character set per origin, suitable for use as the human-readable component of
43//! an atom's cryptographic identity. This allows for meaningful Unicode characters across languages
44//! (beyond just ASCII/English) without permitting nonsensical or overly permissive
45//! content. Validation leverages Unicode general categories for letters and numbers.
46//!
47//! Atom Tags must:
48//! - Be valid UTF-8 encoded Unicode strings
49//! - Not exceed 128 bytes in length (measured in UTF-8 bytes)
50//! - Not be empty
51//! - Start with a Unicode letter (general categories: UppercaseLetter [Lu], LowercaseLetter [Ll],
52//! TitlecaseLetter [Lt], ModifierLetter [Lm], or OtherLetter [Lo]; not a number, underscore, or
53//! hyphen)
54//! - Contain only Unicode letters (as defined above), Unicode numbers (DecimalNumber [Nd] or
55//! LetterNumber [Nl]), hyphens (`-`), and underscores (`_`)
56//!
57//! ## Usage Example
58//!
59//! ```rust,no_run
60//! use atom::store::git::Root;
61//! use atom::{AtomId, AtomTag, Compute, Origin};
62//!
63//! // Create a validated atom tag
64//! let tag = AtomTag::try_from("my-atom").unwrap();
65//!
66//! // Create an AtomId with a Git origin
67//! let repo = gix::open(".").unwrap();
68//! let commit = repo
69//! .rev_parse_single("HEAD")
70//! .map(|s| repo.find_commit(s))
71//! .unwrap()
72//! .unwrap();
73//!
74//! let id = AtomId::construct(&commit, tag).unwrap();
75//!
76//! // Get the hash for disambiguated identification
77//! let hash = id.compute_hash();
78//! println!("Atom hash: {}", hash);
79//! ```
80#[cfg(test)]
81mod tests;
82
83use std::borrow::Borrow;
84use std::ffi::OsStr;
85use std::fmt;
86use std::ops::Deref;
87use std::str::FromStr;
88
89use serde::{Deserialize, Serialize, Serializer};
90use thiserror::Error;
91use unic_ucd_category::GeneralCategory;
92
/// Upper bound on the length of an atom tag, measured in UTF-8 bytes.
const ID_MAX: usize = 128;
94
95#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
96#[serde(try_from = "String")]
97/// A vetted String suitable for an atom's `tag` field
98pub struct AtomTag(String);
99
100#[derive(Error, Debug, PartialEq, Eq)]
101/// Errors that can occur during atom tag validation.
102///
103/// These errors represent various validation failures when creating or parsing
104/// atom identifiers, ensuring they conform to the required format and constraints
105/// for cryptographically secure identification.
106pub enum Error {
107 /// The atom identifier exceeds the maximum allowed length of 128 bytes.
108 ///
109 /// Atom tags are limited in size to ensure efficient storage and processing
110 /// while maintaining a sufficient character space for meaningful identifiers.
111 #[error("An Atom id cannot be more than {} bytes", ID_MAX)]
112 TooLong,
113
114 /// The atom identifier is empty.
115 ///
116 /// Empty identifiers are not allowed as they provide no meaningful
117 /// identification for the atom.
118 #[error("An Atom id cannot be empty")]
119 Empty,
120
121 /// The atom identifier starts with an invalid character.
122 ///
123 /// Atom identifiers must start with a Unicode letter (not numbers, hyphens,
124 /// underscores, or other non-letter characters) to ensure they are
125 /// descriptive and follow identifier conventions.
126 #[error("An Atom id cannot start with: '{0}'")]
127 InvalidStart(char),
128
129 /// The atom identifier contains invalid characters.
130 ///
131 /// Only Unicode letters, numbers, hyphens, and underscores are permitted
132 /// in atom identifiers. This ensures compatibility across different systems
133 /// and maintains readability.
134 #[error("The Atom id contains invalid characters: '{0}'")]
135 InvalidCharacters(String),
136
137 /// The atom identifier contains invalid Unicode.
138 ///
139 /// Atom identifiers must be valid UTF-8 encoded Unicode strings.
140 /// This error occurs when the string contains invalid byte sequences
141 /// or encoding issues.
142 #[error("An Atom id must be valid unicode")]
143 InvalidUnicode,
144}
145
146/// Trait for computing BLAKE3 hashes of AtomIds.
147///
148/// This trait is implemented for AtomId to provide a way to compute
149/// cryptographically secure hashes that can be used as unique identifiers
150/// for atoms in storage backends.
151pub trait Compute<'id, T>: Borrow<[u8]> {
152 /// Computes the BLAKE3 hash of this AtomId.
153 ///
154 /// The hash is computed using a key derived from the atom's root value,
155 /// ensuring that atoms with the same ID but different roots produce
156 /// different hashes.
157 ///
158 /// # Returns
159 ///
160 /// An `IdHash` containing the 32-byte BLAKE3 hash and a reference
161 /// to the original AtomId.
162 fn compute_hash(&'id self) -> IdHash<'id, T>;
163}
164
/// Implemented by source types from which the origin of an [`AtomId`] can be obtained.
///
/// [`AtomId::construct`] calls [`Origin::calculate_origin`] on its `src` argument and
/// stores the returned value as the id's origin (root) field.
pub trait Origin<R> {
    /// The error type returned by [`Origin::calculate_origin`].
    type Error;

    /// Calculates the origin (root) value for an [`AtomId`].
    ///
    /// # Errors
    ///
    /// Returns `Self::Error` if the calculation fails or is impossible.
    fn calculate_origin(&self) -> Result<R, Self::Error>;
}
178
179/// The type representing all the components necessary to serve as
180/// an unambiguous identifier. Atoms consist of a human-readable
181/// Unicode identifier, as well as a root field, which varies for
182/// each store implementation. For example, Git uses the oldest
183/// commit in a repositories history.
184#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
185pub struct AtomId<R> {
186 origin: R,
187 tag: AtomTag,
188}
189
190#[derive(Debug, Clone, PartialEq, Eq, Hash)]
191/// Represents the BLAKE3 hash of an AtomId.
192///
193/// This struct contains a 32-byte BLAKE3 hash that serves as a
194/// cryptographically secure, globally unique identifier for an atom.
195/// The hash is computed from the combination of the atom's human-readable
196/// ID and its context-specific origin value.
197pub struct IdHash<'id, T> {
198 /// The 32-byte BLAKE3 hash value
199 hash: [u8; 32],
200 /// Reference to the AtomId that was hashed
201 id: &'id AtomId<T>,
202}
203
204impl<R> Serialize for AtomId<R> {
205 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
206 where
207 S: Serializer,
208 {
209 // Serialize only the `tag` field as a string
210 self.tag.serialize(serializer)
211 }
212}
213
214impl<T> Deref for IdHash<'_, T> {
215 type Target = [u8; 32];
216
217 fn deref(&self) -> &Self::Target {
218 &self.hash
219 }
220}
221
222impl<'id, R: AsRef<[u8]>> Compute<'id, R> for AtomId<R> {
223 fn compute_hash(&'id self) -> IdHash<'id, R> {
224 use blake3::Hasher;
225
226 let key = blake3::derive_key("AtomId", self.origin.as_ref());
227
228 let mut hasher = Hasher::new_keyed(&key);
229 hasher.update(self.tag.as_bytes());
230 IdHash {
231 hash: *hasher.finalize().as_bytes(),
232 id: self,
233 }
234 }
235}
236
237impl<T> Borrow<[u8]> for AtomId<T> {
238 fn borrow(&self) -> &[u8] {
239 self.tag.as_bytes()
240 }
241}
242
243impl<R> AtomId<R>
244where
245 for<'id> AtomId<R>: Compute<'id, R>,
246{
247 /// Compute an atom's origin and construct its ID. This method takes a `src`
248 /// type which must implement the [`Origin`] struct.
249 ///
250 /// # Errors
251 ///
252 /// This function will return an error if the call to
253 /// [`Origin::calculate_origin`] fails.
254 pub fn construct<T>(src: &T, tag: AtomTag) -> Result<Self, T::Error>
255 where
256 T: Origin<R>,
257 {
258 let origin = src.calculate_origin()?;
259 Ok(AtomId { origin, tag })
260 }
261
262 /// The root field, which serves as a derived key for the blake-3 hash used to
263 /// identify the Atom in backend implementations.
264 pub fn root(&self) -> &R {
265 &self.origin
266 }
267}
268
/// Text of the crate-internal root tag.
///
/// It begins with an underscore, so it can never be produced through ordinary tag
/// validation, which requires a leading Unicode letter.
pub(crate) const ROOT_TAG: &str = "__ROOT";
270
271/// For contexts where you want an identifier that is validated similarly to an AtomTag.
272pub type Name = AtomTag;
273
274impl AtomTag {
275 fn validate_start(c: char) -> Result<(), Error> {
276 if AtomTag::is_invalid_start(c) {
277 return Err(Error::InvalidStart(c));
278 }
279 Ok(())
280 }
281
282 /// special purpose, internal only function to return what would normally be an invalid tag
283 /// specifying the repo root commit. Allowing the root commit to be, e.g. packaged in iterators
284 /// containing AtomTags
285 pub(crate) fn root_tag() -> AtomTag {
286 AtomTag(ROOT_TAG.into())
287 }
288
289 pub(crate) fn is_root(&self) -> bool {
290 self == &AtomTag::root_tag()
291 }
292
293 pub(super) fn validate(s: &str) -> Result<(), Error> {
294 if s.len() > ID_MAX {
295 return Err(Error::TooLong);
296 }
297
298 match s.chars().next().map(AtomTag::validate_start) {
299 Some(Ok(())) => (),
300 Some(Err(e)) => return Err(e),
301 None => return Err(Error::Empty),
302 }
303
304 let invalid_chars: String = s.chars().filter(|&c| !AtomTag::is_valid_char(c)).collect();
305
306 if !invalid_chars.is_empty() {
307 return Err(Error::InvalidCharacters(invalid_chars));
308 }
309
310 Ok(())
311 }
312
313 pub(super) fn is_invalid_start(c: char) -> bool {
314 matches!(
315 GeneralCategory::of(c),
316 GeneralCategory::DecimalNumber | GeneralCategory::LetterNumber
317 ) || c == '_'
318 || c == '-'
319 || !AtomTag::is_valid_char(c)
320 }
321
322 pub(super) fn is_valid_char(c: char) -> bool {
323 matches!(
324 GeneralCategory::of(c),
325 GeneralCategory::LowercaseLetter
326 | GeneralCategory::UppercaseLetter
327 | GeneralCategory::TitlecaseLetter
328 | GeneralCategory::ModifierLetter
329 | GeneralCategory::OtherLetter
330 | GeneralCategory::DecimalNumber
331 | GeneralCategory::LetterNumber
332 ) || c == '-'
333 || c == '_'
334 }
335}
336
337impl Deref for AtomTag {
338 type Target = String;
339
340 fn deref(&self) -> &Self::Target {
341 &self.0
342 }
343}
344
345impl fmt::Display for AtomTag {
346 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
347 write!(f, "{}", self.0)
348 }
349}
350impl FromStr for AtomTag {
351 type Err = Error;
352
353 fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
354 AtomTag::validate(s)?;
355 Ok(AtomTag(s.to_string()))
356 }
357}
358
359impl TryFrom<String> for AtomTag {
360 type Error = Error;
361
362 fn try_from(s: String) -> Result<Self, Self::Error> {
363 AtomTag::validate(&s)?;
364 Ok(AtomTag(s))
365 }
366}
367
368impl TryFrom<&OsStr> for AtomTag {
369 type Error = Error;
370
371 fn try_from(s: &OsStr) -> Result<Self, Self::Error> {
372 let s = s.to_str().ok_or(Error::InvalidUnicode)?;
373 AtomTag::from_str(s)
374 }
375}
376
377impl TryFrom<&str> for AtomTag {
378 type Error = Error;
379
380 fn try_from(s: &str) -> Result<Self, Self::Error> {
381 AtomTag::from_str(s)
382 }
383}
384
385use std::fmt::Display;
386
387impl<R> AtomId<R> {
388 /// Return a reference to the Atom's Unicode identifier.
389 pub fn tag(&self) -> &AtomTag {
390 &self.tag
391 }
392}
393
394impl<R> Display for AtomId<R>
395where
396 for<'id> AtomId<R>: Compute<'id, R>,
397{
398 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
399 let s = self.compute_hash();
400 if let Some(max_width) = f.precision() {
401 write!(f, "{s:.max_width$}")
402 } else {
403 write!(f, "{s}")
404 }
405 }
406}
407
408impl<'a, R> Display for IdHash<'a, R> {
409 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
410 let s = base32::encode(crate::BASE32, &self.hash);
411 if let Some(max_width) = f.precision() {
412 write!(f, "{s:.max_width$}")
413 } else {
414 f.write_str(&s)
415 }
416 }
417}