001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.digest; 019 020import java.io.ByteArrayOutputStream; 021import java.io.IOException; 022import java.io.InputStream; 023import java.nio.charset.StandardCharsets; 024import java.nio.file.DirectoryStream; 025import java.nio.file.Files; 026import java.nio.file.Path; 027import java.security.MessageDigest; 028import java.util.HashMap; 029import java.util.Map; 030import java.util.Objects; 031import java.util.Set; 032import java.util.TreeSet; 033import java.util.function.Supplier; 034 035/** 036 * Computes <a href="https://git-scm.com/">Git</a> object identifiers and their generalizations described by the 037 * <a href="https://www.swhid.org/swhid-specification/">SWHID specification</a>. 038 * 039 * <p>When the hash algorithm is SHA-1, the identifiers produced by this class are identical to those used by Git. 040 * Other hash algorithms produce generalized identifiers as described by the SWHID specification.</p> 041 * 042 * <p>This class is immutable and thread-safe. However, the {@link MessageDigest} instances passed to it generally won't be.</p> 043 * 044 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a> 045 * @see <a href="https://www.swhid.org/swhid-specification/">SWHID Specification</a> 046 * @since 1.22.0 047 */ 048public class GitIdentifiers { 049 050 /** 051 * Represents a single entry in a Git tree object. 052 * 053 * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p> 054 * <ul> 055 * <li>a {@link FileMode} that determines the Unix file mode (e.g. {@code 100644} for a regular file),</li> 056 * <li>the entry name (file or directory name, without a path separator),</li> 057 * <li>the raw object id of the referenced blob or sub-tree.</li> 058 * </ul> 059 * 060 * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: directory names are compared as if they ended with {@code '/'}, so that {@code foo/} 061 * sorts after {@code foobar}.</p> 062 * 063 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a> 064 * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a> 065 */ 066 static class DirectoryEntry implements Comparable<DirectoryEntry> { 067 068 /** 069 * The entry name (file or directory name, no path separator). 070 */ 071 private final String name; 072 073 /** 074 * The raw object id of the referenced blob or sub-tree. 075 */ 076 private final byte[] rawObjectId; 077 078 /** 079 * The key used for ordering entries within a tree object. 080 * 081 * <p>>Git appends {@code '/'} to directory names before comparing.</p> 082 */ 083 private final String sortKey; 084 085 /** 086 * The Git object type, which determines the Unix file-mode prefix. 087 */ 088 private final FileMode type; 089 090 /** 091 * Constructs a new entry. 092 * 093 * @param name The name of the entry, not containing {@code '/'}. 094 * @param type The type of the entry, not null. 095 * @param rawObjectId The id of the entry, not null. 096 */ 097 DirectoryEntry(final String name, final FileMode type, final byte[] rawObjectId) { 098 if (Objects.requireNonNull(name).indexOf('/') >= 0) { 099 throw new IllegalArgumentException("Entry name must not contain '/': " + name); 100 } 101 this.name = name; 102 this.type = Objects.requireNonNull(type); 103 this.sortKey = type == FileMode.DIRECTORY ? name + "/" : name; 104 this.rawObjectId = Objects.requireNonNull(rawObjectId); 105 } 106 107 @Override 108 public int compareTo(final DirectoryEntry o) { 109 return sortKey.compareTo(o.sortKey); 110 } 111 112 @Override 113 public boolean equals(final Object obj) { 114 if (obj == this) { 115 return true; 116 } 117 if (!(obj instanceof DirectoryEntry)) { 118 return false; 119 } 120 final DirectoryEntry other = (DirectoryEntry) obj; 121 return name.equals(other.name); 122 } 123 124 @Override 125 public int hashCode() { 126 return name.hashCode(); 127 } 128 129 } 130 131 /** 132 * The type of a Git tree entry, which maps to a Unix file-mode string. 133 * 134 * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here 135 * cover the four entry types that Git itself produces.</p> 136 * 137 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a> 138 */ 139 public enum FileMode { 140 141 /** 142 * A subdirectory. Subdirectories can only be specified by SHA or through a tree mark set with {@code --import-marks}. 143 * 144 * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a> 145 */ 146 DIRECTORY(new byte[] { '4', '0', '0', '0', '0' }), 147 148 /** 149 * A regular, but executable, file. 150 */ 151 EXECUTABLE(new byte[] { '1', '0', '0', '7', '5', '5' }), 152 153 /** 154 * A gitlink, SHA-1 of the object refers to a commit in another repository. Git links can only be specified either by SHA or through a commit mark. They 155 * are used to implement submodules. 156 * 157 * @see <a href="https://git-scm.com/docs/gitdatamodel">gitdatamodel - Git's core data model</a> 158 * @see <a href="https://git-scm.com/docs/git-fast-import">git-fast-import - Backend for fast Git data importers</a> 159 */ 160 GIT_LINK(new byte[] { '1', '6', '0', '0', '0', '0' }), 161 162 /** 163 * A regular (non-executable) file. 164 * <p> 165 * The majority of files in most projects use this mode. If in doubt, this is what you want. 166 * </p> 167 */ 168 REGULAR(new byte[] { '1', '0', '0', '6', '4', '4' }), 169 170 /** 171 * A symbolic link. The content of the file will be the link target. 172 */ 173 SYMBOLIC_LINK(new byte[] { '1', '2', '0', '0', '0', '0' }); 174 175 private static FileMode get(final Path path) { 176 // Symbolic links first 177 if (Files.isSymbolicLink(path)) { 178 return SYMBOLIC_LINK; 179 } 180 if (Files.isDirectory(path)) { 181 return DIRECTORY; 182 } 183 if (Files.isExecutable(path)) { 184 return EXECUTABLE; 185 } 186 return REGULAR; 187 } 188 189 /** 190 * Serialized {@code mode}: since this is mutable, it must remain private. 191 */ 192 private final byte[] modeBytes; 193 194 FileMode(final byte[] modeBytes) { 195 // No need for a defensive copy since the array is private and never exposed, 196 this.modeBytes = modeBytes; 197 } 198 } 199 200 /** 201 * Builds a Git tree identifier for a virtual directory structure, such as the contents of 202 * an archive. 203 */ 204 public static final class TreeIdBuilder implements Supplier<byte[]> { 205 206 /** 207 * Supplies a blob identifier that may throw {@link IOException}. 208 */ 209 @FunctionalInterface 210 private interface BlobIdSupplier { 211 byte[] get() throws IOException; 212 } 213 214 private static String requireNoParentTraversal(final String name) { 215 if ("..".equals(name)) { 216 throw new IllegalArgumentException("Path component not allowed: " + name); 217 } 218 return name; 219 } 220 221 private final Map<String, TreeIdBuilder> dirEntries = new HashMap<>(); 222 private final Map<String, DirectoryEntry> fileEntries = new HashMap<>(); 223 private final MessageDigest messageDigest; 224 225 private TreeIdBuilder(final MessageDigest messageDigest) { 226 this.messageDigest = Objects.requireNonNull(messageDigest); 227 } 228 229 /** 230 * Adds and returns the {@link TreeIdBuilder} for the named subdirectory, creating it if absent. 231 * 232 * @param name The relative path of the subdirectory in normalized form (may contain {@code '/'}). 233 * @return The {@link TreeIdBuilder} for the subdirectory. 234 * @throws IllegalArgumentException If any path component is {@code ".."}. 235 */ 236 public TreeIdBuilder addDirectory(final String name) { 237 TreeIdBuilder current = this; 238 for (final String component : name.split("/", -1)) { 239 // Noop segments 240 if (component.isEmpty() || ".".equals(component)) { 241 continue; 242 } 243 current = current.dirEntries.computeIfAbsent(requireNoParentTraversal(component), k -> new TreeIdBuilder(messageDigest)); 244 } 245 return current; 246 } 247 248 private void addFile(final FileMode mode, final String name, final BlobIdSupplier blobId) throws IOException { 249 final int slash = name.lastIndexOf('/'); 250 if (slash < 0) { 251 fileEntries.put(name, new DirectoryEntry(requireNoParentTraversal(name), mode, blobId.get())); 252 } else { 253 addDirectory(name.substring(0, slash)).addFile(mode, name.substring(slash + 1), blobId); 254 } 255 } 256 257 /** 258 * Adds a file entry at the given path within this tree. 259 * 260 * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p> 261 * 262 * @param mode The file mode (e.g. {@link FileMode#REGULAR}). 263 * @param name The relative path of the entry in normalized form(may contain {@code '/'}). 264 * @param data The file content. 265 * @throws IOException If an I/O error occurs. 266 * @throws IllegalArgumentException If any path component is {@code ".."}. 267 */ 268 public void addFile(final FileMode mode, final String name, final byte[] data) throws IOException { 269 addFile(mode, name, () -> blobId(messageDigest, data)); 270 } 271 272 /** 273 * Adds a file entry at the given path within this tree, streaming content without buffering. 274 * 275 * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p> 276 * 277 * <p>The stream is eagerly drained.</p> 278 * 279 * @param mode The file mode (e.g. {@link FileMode#REGULAR}). 280 * @param name The relative path of the entry in normalized form(may contain {@code '/'}). 281 * @param dataSize The exact number of bytes in {@code data}. 282 * @param data The file content. 283 * @throws IOException If the stream cannot be read. 284 * @throws IllegalArgumentException If any path component is {@code ".."}. 285 */ 286 public void addFile(final FileMode mode, final String name, final long dataSize, final InputStream data) throws IOException { 287 addFile(mode, name, () -> blobId(messageDigest, dataSize, data)); 288 } 289 290 /** 291 * Adds a symbolic link entry at the given path within this tree. 292 * 293 * <p>If {@code name} contains {@code '/'}, intermediate subdirectories are created automatically.</p> 294 * 295 * @param name The relative path of the entry in normalized form(may contain {@code '/'}). 296 * @param target The target of the symbolic link. 297 * @throws IOException If an I/O error occurs. 298 * @throws IllegalArgumentException If any path component is {@code ".."}. 299 */ 300 public void addSymbolicLink(final String name, final String target) throws IOException { 301 addFile(FileMode.SYMBOLIC_LINK, name, target.getBytes(StandardCharsets.UTF_8)); 302 } 303 304 /** 305 * Computes the Git tree identifier for this directory and all its descendants. 306 * 307 * @return The raw tree identifier bytes. 308 */ 309 @Override 310 public byte[] get() { 311 final Set<DirectoryEntry> entries = new TreeSet<>(fileEntries.values()); 312 dirEntries.forEach((k, v) -> entries.add(new DirectoryEntry(k, FileMode.DIRECTORY, v.get()))); 313 final ByteArrayOutputStream baos = new ByteArrayOutputStream(); 314 for (final DirectoryEntry entry : entries) { 315 baos.write(entry.type.modeBytes, 0, entry.type.modeBytes.length); 316 baos.write(' '); 317 final byte[] bytes = entry.name.getBytes(StandardCharsets.UTF_8); 318 baos.write(bytes, 0, bytes.length); 319 baos.write('\0'); 320 baos.write(entry.rawObjectId, 0, entry.rawObjectId.length); 321 } 322 messageDigest.reset(); 323 DigestUtils.updateDigest(messageDigest, getGitTreePrefix(baos.size())); 324 return DigestUtils.updateDigest(messageDigest, baos.toByteArray()).digest(); 325 } 326 327 private TreeIdBuilder populate(final Path directory) throws IOException { 328 try (DirectoryStream<Path> files = Files.newDirectoryStream(directory)) { 329 for (final Path path : files) { 330 final String name = Objects.toString(path.getFileName()); 331 final FileMode mode = FileMode.get(path); 332 if (mode == FileMode.DIRECTORY) { 333 addDirectory(name).populate(path); 334 } else { 335 addFile(mode, name, () -> blobId(messageDigest, path)); 336 } 337 } 338 } 339 return this; 340 } 341 } 342 343 /** 344 * Reads through a byte array and returns a generalized Git blob identifier. 345 * 346 * <p>The identifier is computed in the way described by the 347 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash 348 * algorithm.</p> 349 * 350 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p> 351 * 352 * @param messageDigest The MessageDigest to use (for example SHA-1). 353 * @param data Data to digest. 354 * @return A generalized Git blob identifier. 355 */ 356 public static byte[] blobId(final MessageDigest messageDigest, final byte[] data) { 357 messageDigest.reset(); 358 DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(data.length)); 359 return DigestUtils.digest(messageDigest, data); 360 } 361 362 /** 363 * Reads through a stream of known size and returns a generalized Git blob identifier, without buffering. 364 * 365 * <p>When the size of the content is known in advance, this overload streams {@code data} directly through 366 * the digest without buffering the full content in memory.</p> 367 * 368 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p> 369 * 370 * @param messageDigest The MessageDigest to use (for example SHA-1). 371 * @param dataSize The exact number of bytes in {@code data}. 372 * @param data Stream to digest. 373 * @return A generalized Git blob identifier. 374 * @throws IOException On error reading the stream. 375 */ 376 public static byte[] blobId(final MessageDigest messageDigest, final long dataSize, final InputStream data) throws IOException { 377 messageDigest.reset(); 378 DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(dataSize)); 379 return DigestUtils.updateDigest(messageDigest, data).digest(); 380 } 381 382 /** 383 * Reads through a file and returns a generalized Git blob identifier. 384 * 385 * <p>The identifier is computed in the way described by the 386 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash 387 * algorithm.</p> 388 * 389 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p> 390 * 391 * @param messageDigest The MessageDigest to use (for example SHA-1). 392 * @param data Path to the file to digest. 393 * @return A generalized Git blob identifier. 394 * @throws IOException On error accessing the file. 395 */ 396 public static byte[] blobId(final MessageDigest messageDigest, final Path data) throws IOException { 397 if (Files.isSymbolicLink(data)) { 398 final byte[] linkTarget = Files.readSymbolicLink(data).toString().getBytes(StandardCharsets.UTF_8); 399 return blobId(messageDigest, linkTarget); 400 } 401 messageDigest.reset(); 402 DigestUtils.updateDigest(messageDigest, getGitBlobPrefix(Files.size(data))); 403 return DigestUtils.updateDigest(messageDigest, data).digest(); 404 } 405 406 private static byte[] getGitBlobPrefix(final long dataSize) { 407 return getGitPrefix("blob", dataSize); 408 } 409 410 private static byte[] getGitPrefix(final String type, final long dataSize) { 411 return (type + " " + dataSize + "\0").getBytes(StandardCharsets.UTF_8); 412 } 413 414 private static byte[] getGitTreePrefix(final long dataSize) { 415 return getGitPrefix("tree", dataSize); 416 } 417 418 /** 419 * Reads through a directory and returns a generalized Git tree identifier. 420 * 421 * <p>The identifier is computed in the way described by the 422 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash 423 * algorithm.</p> 424 * 425 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p> 426 * 427 * @param messageDigest The MessageDigest to use (for example SHA-1). 428 * @param data Path to the directory to digest. 429 * @return A generalized Git tree identifier. 430 * @throws IOException On error accessing the directory or its contents. 431 */ 432 public static byte[] treeId(final MessageDigest messageDigest, final Path data) throws IOException { 433 return treeIdBuilder(messageDigest).populate(data).get(); 434 } 435 436 /** 437 * Returns a new {@link TreeIdBuilder} for constructing a generalized Git tree identifier from a virtual directory 438 * structure, such as the contents of an archive. 439 * 440 * <p>The identifier is computed in the way described by the 441 * <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash 442 * algorithm.</p> 443 * 444 * <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p> 445 * 446 * @param messageDigest The MessageDigest to use (for example SHA-1). 447 * @return A new {@link TreeIdBuilder}. 448 */ 449 public static TreeIdBuilder treeIdBuilder(final MessageDigest messageDigest) { 450 return new TreeIdBuilder(messageDigest); 451 } 452 453 private GitIdentifiers() { 454 // utility class 455 } 456}