001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.io.input; 019 020import static org.apache.commons.io.IOUtils.EOF; 021 022import java.io.IOException; 023import java.io.InputStream; 024import java.util.Arrays; 025import java.util.Comparator; 026import java.util.List; 027import java.util.Objects; 028 029import org.apache.commons.io.ByteOrderMark; 030import org.apache.commons.io.IOUtils; 031 032/** 033 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. 034 * <p> 035 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the first byte in the stream. 036 * </p> 037 * <p> 038 * The {@link ByteOrderMark} implementation has the following predefined BOMs: 039 * </p> 040 * <ul> 041 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li> 042 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li> 043 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li> 044 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li> 045 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li> 046 * </ul> 047 * <p> 048 * To build an instance, use {@link Builder}. 049 * </p> 050 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2> 051 * 052 * <pre> 053 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get(); 054 * if (bomIn.hasBOM()) { 055 * // has a UTF-8 BOM 056 * } 057 * </pre> 058 * 059 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2> 060 * 061 * <pre> 062 * boolean include = true; 063 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).setInclude(include).get(); 064 * if (bomIn.hasBOM()) { 065 * // has a UTF-8 BOM 066 * } 067 * </pre> 068 * 069 * <h2>Example 3 - Detecting Multiple BOMs</h2> 070 * 071 * <pre> 072 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in) 073 * .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE).get(); 074 * if (bomIn.hasBOM() == false) { 075 * // No BOM found 076 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) { 077 * // has a UTF-16LE BOM 078 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) { 079 * // has a UTF-16BE BOM 080 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) { 081 * // has a UTF-32LE BOM 082 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) { 083 * // has a UTF-32BE BOM 084 * } 085 * </pre> 086 * <p> 087 * To build an instance, use {@link Builder}. 088 * </p> 089 * <p> 090 * This class is not thread-safe. 091 * </p> 092 * 093 * @see Builder 094 * @see org.apache.commons.io.ByteOrderMark 095 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a> 096 * @since 2.0 097 */ 098public class BOMInputStream extends ProxyInputStream { 099 100 // @formatter:off 101 /** 102 * Builds a new {@link BOMInputStream}. 103 * 104 * <h2>Using NIO</h2> 105 * <pre>{@code 106 * BOMInputStream s = BOMInputStream.builder() 107 * .setPath(Paths.get("MyFile.xml")) 108 * .setByteOrderMarks(ByteOrderMark.UTF_8) 109 * .setInclude(false) 110 * .get();} 111 * </pre> 112 * <h2>Using IO</h2> 113 * <pre>{@code 114 * BOMInputStream s = BOMInputStream.builder() 115 * .setFile(new File("MyFile.xml")) 116 * .setByteOrderMarks(ByteOrderMark.UTF_8) 117 * .setInclude(false) 118 * .get();} 119 * </pre> 120 * 121 * @see #get() 122 * @since 2.12.0 123 */ 124 // @formatter:on 125 public static class Builder extends AbstractBuilder<BOMInputStream, Builder> { 126 127 private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 }; 128 129 /** 130 * For test access. 131 * 132 * @return the default byte order mark. 133 */ 134 static ByteOrderMark getDefaultByteOrderMark() { 135 return DEFAULT[0]; 136 } 137 138 private ByteOrderMark[] byteOrderMarks = DEFAULT; 139 private boolean include; 140 141 /** 142 * Constructs a new builder of {@link BOMInputStream}. 143 */ 144 public Builder() { 145 // empty 146 } 147 148 /** 149 * Builds a new {@link BOMInputStream}. 150 * <p> 151 * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception. 152 * </p> 153 * <p> 154 * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[]. 155 * </p> 156 * <p> 157 * This builder uses the following aspects: 158 * </p> 159 * <ul> 160 * <li>{@link #getInputStream()}</li> 161 * <li>include}</li> 162 * <li>byteOrderMarks</li> 163 * </ul> 164 * 165 * @return a new instance. 166 * @throws IllegalStateException if the {@code origin} is {@code null}. 167 * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}. 168 * @throws IOException if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}. 169 * @see #getInputStream() 170 * @see #getUnchecked() 171 */ 172 @Override 173 public BOMInputStream get() throws IOException { 174 return new BOMInputStream(this); 175 } 176 177 /** 178 * Sets the ByteOrderMarks to detect and optionally exclude. 179 * <p> 180 * The default is {@link ByteOrderMark#UTF_8}. 181 * </p> 182 * 183 * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude. 184 * @return {@code this} instance. 185 */ 186 public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) { 187 this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT; 188 return this; 189 } 190 191 /** 192 * Sets whether to include the UTF-8 BOM (true) or to exclude it (false). 193 * <p> 194 * The default is false. 195 * </p> 196 * 197 * @param include true to include the UTF-8 BOM or false to exclude it. return this;. 198 * @return {@code this} instance. 199 */ 200 public Builder setInclude(final boolean include) { 201 this.include = include; 202 return this; 203 } 204 } 205 206 /** 207 * Compares ByteOrderMark objects in descending length order. 208 */ 209 private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed(); 210 211 /** 212 * Constructs a new {@link Builder}. 213 * 214 * @return a new {@link Builder}. 215 * @since 2.12.0 216 */ 217 public static Builder builder() { 218 return new Builder(); 219 } 220 221 /** 222 * BOMs are sorted from longest to shortest. 223 */ 224 private final List<ByteOrderMark> bomList; 225 private final ByteOrderMark byteOrderMark; 226 private int fbIndex; 227 private int[] firstBytes; 228 private final boolean include; 229 private boolean markedAtStart; 230 private int markFbIndex; 231 232 /** 233 * Constructs a new instance. 234 * 235 * @param builder The builder. 236 * @throws IOException if an error reading the first bytes of the stream occurs. 237 */ 238 private BOMInputStream(final Builder builder) throws IOException { 239 super(builder); 240 if (IOUtils.length(builder.byteOrderMarks) == 0) { 241 throw new IllegalArgumentException("No ByteOrderMark specified."); 242 } 243 this.include = builder.include; 244 final List<ByteOrderMark> bomList = Arrays.asList(builder.byteOrderMarks); 245 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 246 bomList.sort(ByteOrderMarkLengthComparator); 247 this.bomList = bomList; 248 this.byteOrderMark = readBom(); 249 } 250 251 /** 252 * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM. 253 * 254 * @param delegate the InputStream to delegate to. 255 * @throws IOException if an error reading the first bytes of the stream occurs. 256 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}. 257 */ 258 @Deprecated 259 public BOMInputStream(final InputStream delegate) throws IOException { 260 this(delegate, false, Builder.DEFAULT); 261 } 262 263 /** 264 * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it. 265 * 266 * @param delegate the InputStream to delegate to. 267 * @param include true to include the UTF-8 BOM or false to exclude it. 268 * @throws IOException if an error reading the first bytes of the stream occurs. 269 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}. 270 */ 271 @Deprecated 272 public BOMInputStream(final InputStream delegate, final boolean include) throws IOException { 273 this(delegate, include, Builder.DEFAULT); 274 } 275 276 /** 277 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them. 278 * 279 * @param delegate the InputStream to delegate to. 280 * @param include true to include the specified BOMs or false to exclude them. 281 * @param boms The BOMs to detect and optionally exclude. 282 * @throws IOException if an error reading the first bytes of the stream occurs. 283 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}. 284 */ 285 @Deprecated 286 public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) throws IOException { 287 super(delegate); 288 if (IOUtils.length(boms) == 0) { 289 throw new IllegalArgumentException("No BOMs specified"); 290 } 291 this.include = include; 292 final List<ByteOrderMark> list = Arrays.asList(boms); 293 // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes. 294 list.sort(ByteOrderMarkLengthComparator); 295 this.bomList = list; 296 this.byteOrderMark = readBom(); 297 } 298 299 /** 300 * Constructs a new BOM InputStream that excludes the specified BOMs. 301 * 302 * @param delegate the InputStream to delegate to. 303 * @param boms The BOMs to detect and exclude. 304 * @throws IOException if an error reading the first bytes of the stream occurs. 305 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 306 */ 307 @Deprecated 308 public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) throws IOException { 309 this(delegate, false, boms); 310 } 311 312 /** 313 * Finds a ByteOrderMark with the configured bytes in {@code bomList}. 314 * 315 * @return The matched BOM or null if none matched. 316 */ 317 private ByteOrderMark find() { 318 return bomList.stream().filter(this::matches).findFirst().orElse(null); 319 } 320 321 /** 322 * Gets the ByteOrderMark (Byte Order Mark). 323 * 324 * @return The BOM or null if none matched. 325 */ 326 public ByteOrderMark getBOM() { 327 return byteOrderMark; 328 } 329 330 /** 331 * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}. 332 * 333 * @return The BOM charset Name or null if no BOM found. 334 * @throws IOException if an error reading the first bytes of the stream occurs. 335 */ 336 public String getBOMCharsetName() throws IOException { 337 return byteOrderMark == null ? null : byteOrderMark.getCharsetName(); 338 } 339 340 /** 341 * Tests whether the stream contains one of the specified BOMs. 342 * 343 * @return true if the stream has one of the specified BOMs, otherwise false if it does not. 344 * @throws IOException if an error reading the first bytes of the stream occurs. 345 */ 346 public boolean hasBOM() throws IOException { 347 return getBOM() != null; 348 } 349 350 /** 351 * Tests whether the stream contains the specified BOM. 352 * 353 * @param bom The BOM to check for. 354 * @return true if the stream has the specified BOM, otherwise false if it does not. 355 * @throws IllegalArgumentException if the BOM is not one the stream is configured to detect. 356 * @throws IOException if an error reading the first bytes of the stream occurs. 357 */ 358 public boolean hasBOM(final ByteOrderMark bom) throws IOException { 359 if (!bomList.contains(bom)) { 360 throw new IllegalArgumentException("Stream not configured to detect " + bom); 361 } 362 return Objects.equals(getBOM(), bom); 363 } 364 365 /** 366 * Invokes the delegate's {@link InputStream#mark(int)} method. 367 * 368 * @param readLimit read ahead limit. 369 */ 370 @Override 371 public synchronized void mark(final int readLimit) { 372 markFbIndex = fbIndex; 373 markedAtStart = firstBytes == null; 374 in.mark(readLimit); 375 } 376 377 /** 378 * Checks if the bytes match a BOM. 379 * 380 * @param bom The BOM. 381 * @return true if the bytes match the BOM, otherwise false. 382 */ 383 private boolean matches(final ByteOrderMark bom) { 384 return bom.matches(firstBytes); 385 } 386 387 /** 388 * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM. 389 * 390 * @return the byte read (excluding BOM) or -1 if the end of stream. 391 * @throws IOException if an I/O error occurs. 392 */ 393 @Override 394 public int read() throws IOException { 395 checkOpen(); 396 final int b = readFirstBytes(); 397 return b >= 0 ? b : in.read(); 398 } 399 400 /** 401 * Invokes the delegate's {@link InputStream#read(byte[])} method, detecting and optionally skipping BOM. 402 * 403 * @param buf the buffer to read the bytes into, never {@code null} 404 * @return the number of bytes read (excluding BOM) or -1 if the end of stream. 405 * @throws NullPointerException if the buffer is {@code null} 406 * @throws IOException if an I/O error occurs. 407 */ 408 @Override 409 public int read(final byte[] buf) throws IOException { 410 return read(buf, 0, buf.length); 411 } 412 413 /** 414 * Invokes the delegate's {@link InputStream#read(byte[], int, int)} method, detecting and optionally skipping BOM. 415 * 416 * @param buf the buffer to read the bytes into. 417 * @param off The start offset. 418 * @param len The number of bytes to read (excluding BOM). 419 * @return the number of bytes read or -1 if the end of stream. 420 * @throws NullPointerException if the buffer is {@code null}. 421 * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code buf.length}. 422 * @throws IOException if an I/O error occurs. 423 */ 424 @Override 425 public int read(final byte[] buf, int off, int len) throws IOException { 426 IOUtils.checkFromIndexSize(buf, off, len); 427 if (len == 0) { 428 return 0; 429 } 430 int firstCount = 0; 431 int b = 0; 432 while (len > 0 && b >= 0) { 433 b = readFirstBytes(); 434 if (b >= 0) { 435 buf[off++] = (byte) (b & 0xFF); 436 len--; 437 firstCount++; 438 } 439 } 440 final int secondCount = in.read(buf, off, len); 441 afterRead(secondCount); 442 return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount; 443 } 444 445 /** 446 * Reads the byte order mark. 447 * 448 * @return the byte order mark. 449 * @throws IOException if an error reading the first bytes of the stream occurs. 450 */ 451 private ByteOrderMark readBom() throws IOException { 452 int fbLength = 0; 453 // BOMs are sorted from longest to shortest 454 final int maxBomSize = bomList.get(0).length(); 455 final int[] tmp = new int[maxBomSize]; 456 // Read first maxBomSize bytes 457 for (int i = 0; i < tmp.length; i++) { 458 tmp[i] = in.read(); 459 afterRead(tmp[i]); 460 fbLength++; 461 if (tmp[i] < 0) { 462 break; 463 } 464 } 465 firstBytes = Arrays.copyOf(tmp, fbLength); 466 // match BOM in firstBytes 467 final ByteOrderMark bom = find(); 468 if (bom != null && !include) { 469 if (bom.length() < firstBytes.length) { 470 fbIndex = bom.length(); 471 } else { 472 firstBytes = new int[0]; 473 } 474 } 475 return bom; 476 } 477 478 /** 479 * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a 480 * valid byte or -1 to indicate that the initial bytes have been processed already. 481 * 482 * @return the byte read (excluding BOM) or -1 if at the end of first bytes. 483 * @throws IOException if an I/O error occurs. 484 */ 485 private int readFirstBytes() throws IOException { 486 return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF; 487 } 488 489 /** 490 * Invokes the delegate's {@link InputStream#reset()} method. 491 * 492 * @throws IOException if an I/O error occurs. 493 */ 494 @Override 495 public synchronized void reset() throws IOException { 496 fbIndex = markFbIndex; 497 if (markedAtStart) { 498 firstBytes = null; 499 } 500 in.reset(); 501 } 502 503 /** 504 * Invokes the delegate's {@link InputStream#skip(long)} method, detecting and optionally skipping BOM. 505 * 506 * @param n the number of bytes to skip. 507 * @return the number of bytes to skipped or -1 if the end of stream. 508 * @throws IOException if an I/O error occurs. 509 */ 510 @Override 511 public long skip(final long n) throws IOException { 512 int skipped = 0; 513 while (n > skipped && readFirstBytes() >= 0) { 514 skipped++; 515 } 516 return in.skip(n - skipped) + skipped; 517 } 518}