001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.imaging.formats.jpeg.iptc; 019 020import static org.apache.commons.imaging.common.BinaryFunctions.read2Bytes; 021import static org.apache.commons.imaging.common.BinaryFunctions.read4Bytes; 022import static org.apache.commons.imaging.common.BinaryFunctions.readByte; 023import static org.apache.commons.imaging.common.BinaryFunctions.readBytes; 024import static org.apache.commons.imaging.common.BinaryFunctions.slice; 025import static org.apache.commons.imaging.common.BinaryFunctions.startsWith; 026 027import java.io.ByteArrayInputStream; 028import java.io.ByteArrayOutputStream; 029import java.io.IOException; 030import java.io.InputStream; 031import java.nio.ByteOrder; 032import java.nio.charset.Charset; 033import java.nio.charset.StandardCharsets; 034import java.util.ArrayList; 035import java.util.Arrays; 036import java.util.Comparator; 037import java.util.List; 038import java.util.Objects; 039import java.util.logging.Level; 040import java.util.logging.Logger; 041 042import org.apache.commons.imaging.ImageReadException; 043import org.apache.commons.imaging.ImageWriteException; 044import org.apache.commons.imaging.ImagingConstants; 045import org.apache.commons.imaging.ImagingParameters; 046import org.apache.commons.imaging.common.BinaryFileParser; 047import org.apache.commons.imaging.common.BinaryFunctions; 048import org.apache.commons.imaging.common.BinaryOutputStream; 049import org.apache.commons.imaging.common.ByteConversions; 050import org.apache.commons.imaging.formats.jpeg.JpegConstants; 051import org.apache.commons.imaging.internal.Debug; 052 053public class IptcParser extends BinaryFileParser { 054 055 private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName()); 056 057 private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN; 058 059 /** 060 * Block types (or Image Resource IDs) that are not recommended to be 061 * interpreted when libraries process Photoshop IPTC metadata. 062 * 063 * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/"> Adobe Photoshop File Formats Specification</a> 064 * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246>IMAGING-246</a> 065 * @since 1.0-alpha2 066 */ 067 private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087); 068 069 private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; 070 private static final int ENV_TAG_CODED_CHARACTER_SET = 90; 071 private static final byte[] CHARACTER_ESCAPE_SEQUENCE = {'\u001B', '%', 'G'}; 072 073 public IptcParser() { 074 setByteOrder(ByteOrder.BIG_ENDIAN); 075 } 076 077 public boolean isPhotoshopJpegSegment(final byte[] segmentData) { 078 if (!startsWith(segmentData, 079 JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING)) { 080 return false; 081 } 082 083 final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(); 084 return (index + 4) <= segmentData.length 085 && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM; 086 } 087 088 /* 089 * In practice, App13 segments are only used for Photoshop/IPTC metadata. 090 * However, we should not treat App13 signatures without Photoshop's 091 * signature as Photoshop/IPTC segments. 092 * 093 * A Photoshop/IPTC App13 segment begins with the Photoshop Identification 094 * string. 095 * 096 * There follows 0-N blocks (Photoshop calls them "Image Resource Blocks"). 097 * 098 * Each block has the following structure: 099 * 100 * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13 101 * segment. 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, aka. 102 * IPTC_NAA_RECORD_IMAGE_RESOURCE_ID 3. Block name as a Pascal String. This 103 * is padded to have an even length. 4. 4-byte size (in bytes). 5. Block 104 * data. This is also padded to have an even length. 105 * 106 * The block data consists of a 0-N records. A record has the following 107 * structure: 108 * 109 * 1. 2-byte prefix. The value is always 0x1C02 2. 1-byte record type. The 110 * record types are documented by the IPTC. See IptcConstants. 3. 2-byte 111 * record size (in bytes). 4. Record data, "record size" bytes long. 112 * 113 * Record data (unlike block data) is NOT padded to have an even length. 114 * 115 * Record data, for IPTC record, should always be ISO-8859-1. But according 116 * to SANSELAN-33, this isn't always the case. 117 * 118 * The exception is the first record in the block, which must always be a 119 * record version record, whose value is a two-byte number; the value is 120 * 0x02. 121 * 122 * Some IPTC blocks are missing this first "record version" record, so we 123 * don't require it. 124 */ 125 public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters params) 126 throws ImageReadException, IOException { 127 final boolean strict = params != null && params.isStrict(); 128 129 return parsePhotoshopSegment(bytes, strict); 130 } 131 132 public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImageReadException, 133 IOException { 134 final List<IptcRecord> records = new ArrayList<>(); 135 136 final List<IptcBlock> blocks = parseAllBlocks(bytes, strict); 137 138 for (final IptcBlock block : blocks) { 139 // Ignore everything but IPTC data. 140 if (!block.isIPTCBlock()) { 141 continue; 142 } 143 144 records.addAll(parseIPTCBlock(block.getBlockData())); 145 } 146 147 return new PhotoshopApp13Data(records, blocks); 148 } 149 150 protected List<IptcRecord> parseIPTCBlock(final byte[] bytes) { 151 Charset charset = DEFAULT_CHARSET; 152 final List<IptcRecord> elements = new ArrayList<>(); 153 154 int index = 0; 155 // Integer recordVersion = null; 156 while (index + 1 < bytes.length) { 157 final int tagMarker = 0xff & bytes[index++]; 158 Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")"); 159 160 if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) { 161 if (LOGGER.isLoggable(Level.FINE)) { 162 LOGGER.fine("Unexpected record tag marker in IPTC data."); 163 } 164 return elements; 165 } 166 167 final int recordNumber = 0xff & bytes[index++]; 168 Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")"); 169 170 // int recordPrefix = convertByteArrayToShort("recordPrefix", index, 171 // bytes); 172 // if (verbose) 173 // Debug.debug("recordPrefix", recordPrefix + " (0x" 174 // + Integer.toHexString(recordPrefix) + ")"); 175 // index += 2; 176 // 177 // if (recordPrefix != IPTC_RECORD_PREFIX) 178 // { 179 // if (verbose) 180 // System.out 181 // .println("Unexpected record prefix in IPTC data!"); 182 // return elements; 183 // } 184 185 // throw new ImageReadException( 186 // "Unexpected record prefix in IPTC data."); 187 188 final int recordType = 0xff & bytes[index]; 189 Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")"); 190 index++; 191 192 final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder()); 193 index += 2; 194 195 final boolean extendedDataset = recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE; 196 final int dataFieldCountLength = recordSize & 0x7fff; 197 if (extendedDataset) { 198 Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength); 199 } 200 if (extendedDataset) { 201 // ignore extended dataset and everything after. 202 return elements; 203 } 204 205 final byte[] recordData = slice(bytes, index, recordSize); 206 index += recordSize; 207 208 // Debug.debug("recordSize", recordSize + " (0x" 209 // + Integer.toHexString(recordSize) + ")"); 210 211 if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) { 212 charset = findCharset(recordData); 213 continue; 214 } 215 216 if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) { 217 continue; 218 } 219 220 if (recordType == 0) { 221 if (LOGGER.isLoggable(Level.FINE)) { 222 LOGGER.fine("ignore record version record! " + elements.size()); 223 } 224 // ignore "record version" record; 225 continue; 226 } 227 // if (recordVersion == null) 228 // { 229 // // The first record in a JPEG/Photoshop IPTC block must be 230 // // the record version. 231 // if (recordType != 0) 232 // throw new ImageReadException("Missing record version: " 233 // + recordType); 234 // recordVersion = new Integer(convertByteArrayToShort( 235 // "recordNumber", recordData)); 236 // 237 // if (recordSize != 2) 238 // throw new ImageReadException( 239 // "Invalid record version record size: " + recordSize); 240 // 241 // // JPEG/Photoshop IPTC metadata is always in Record version 242 // // 2 243 // if (recordVersion.intValue() != 2) 244 // throw new ImageReadException( 245 // "Invalid IPTC record version: " + recordVersion); 246 // 247 // // Debug.debug("recordVersion", recordVersion); 248 // continue; 249 // } 250 251 final String value = new String(recordData, charset); 252 253 final IptcType iptcType = IptcTypeLookup.getIptcType(recordType); 254 255 // Debug.debug("iptcType", iptcType); 256 // debugByteArray("iptcData", iptcData); 257 // Debug.debug(); 258 259 // if (recordType == IPTC_TYPE_CREDIT.type 260 // || recordType == IPTC_TYPE_OBJECT_NAME.type) 261 // { 262 // this.debugByteArray("recordData", recordData); 263 // Debug.debug("index", IPTC_TYPE_CREDIT.name); 264 // } 265 266 final IptcRecord element = new IptcRecord(iptcType, value); 267 elements.add(element); 268 } 269 270 return elements; 271 } 272 273 protected List<IptcBlock> parseAllBlocks(final byte[] bytes, 274 final boolean strict) throws ImageReadException, IOException { 275 final List<IptcBlock> blocks = new ArrayList<>(); 276 277 try (InputStream bis = new ByteArrayInputStream(bytes)) { 278 279 // Note that these are unsigned quantities. Name is always an even 280 // number of bytes (including the 1st byte, which is the size.) 281 282 final byte[] idString = readBytes("", bis, 283 JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(), 284 "App13 Segment missing identification string"); 285 if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) { 286 throw new ImageReadException("Not a Photoshop App13 Segment"); 287 } 288 289 // int index = PHOTOSHOP_IDENTIFICATION_STRING.length; 290 291 while (true) { 292 final int imageResourceBlockSignature; 293 try { 294 imageResourceBlockSignature = read4Bytes("", bis, 295 "Image Resource Block missing identification string", APP13_BYTE_ORDER); 296 } catch (final IOException ioEx) { 297 break; 298 } 299 if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) { 300 throw new ImageReadException( 301 "Invalid Image Resource Block Signature"); 302 } 303 304 final int blockType = read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER); 305 Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")"); 306 307 // skip blocks that the photoshop spec recommends to, see IMAGING-246 308 if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) { 309 Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")"); 310 // if there is still data in this block, before the next image resource block 311 // (8BIM), then we must consume these bytes to leave a pointer ready to read 312 // the next block 313 BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis); 314 continue; 315 } 316 317 final int blockNameLength = readByte("Name length", bis, "Image Resource Block missing name length"); 318 if (blockNameLength > 0) { 319 Debug.debug("blockNameLength: " + blockNameLength + " (0x" 320 + Integer.toHexString(blockNameLength) + ")"); 321 } 322 byte[] blockNameBytes; 323 if (blockNameLength == 0) { 324 readByte("Block name bytes", bis, "Image Resource Block has invalid name"); 325 blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY; 326 } else { 327 try { 328 blockNameBytes = readBytes("", bis, blockNameLength, 329 "Invalid Image Resource Block name"); 330 } catch (final IOException ioEx) { 331 if (strict) { 332 throw ioEx; 333 } 334 break; 335 } 336 337 if (blockNameLength % 2 == 0) { 338 readByte("Padding byte", bis, "Image Resource Block missing padding byte"); 339 } 340 } 341 342 final int blockSize = read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER); 343 Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")"); 344 345 /* 346 * doesn't catch cases where blocksize is invalid but is still less 347 * than bytes.length but will at least prevent OutOfMemory errors 348 */ 349 if (blockSize > bytes.length) { 350 throw new ImageReadException("Invalid Block Size : " + blockSize + " > " + bytes.length); 351 } 352 353 final byte[] blockData; 354 try { 355 blockData = readBytes("", bis, blockSize, "Invalid Image Resource Block data"); 356 } catch (final IOException ioEx) { 357 if (strict) { 358 throw ioEx; 359 } 360 break; 361 } 362 363 blocks.add(new IptcBlock(blockType, blockNameBytes, blockData)); 364 365 if ((blockSize % 2) != 0) { 366 readByte("Padding byte", bis, "Image Resource Block missing padding byte"); 367 } 368 } 369 370 return blocks; 371 } 372 } 373 374 // private void writeIPTCRecord(BinaryOutputStream bos, ) 375 376 public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data) 377 throws IOException, ImageWriteException { 378 final ByteArrayOutputStream os = new ByteArrayOutputStream(); 379 final BinaryOutputStream bos = new BinaryOutputStream(os); 380 381 JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos); 382 383 final List<IptcBlock> blocks = data.getRawBlocks(); 384 for (final IptcBlock block : blocks) { 385 bos.write4Bytes(JpegConstants.CONST_8BIM); 386 387 if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) { 388 throw new ImageWriteException("Invalid IPTC block type."); 389 } 390 bos.write2Bytes(block.getBlockType()); 391 392 final byte[] blockNameBytes = block.getBlockNameBytes(); 393 if (blockNameBytes.length > 255) { 394 throw new ImageWriteException("IPTC block name is too long: " + blockNameBytes.length); 395 } 396 bos.write(blockNameBytes.length); 397 bos.write(blockNameBytes); 398 if (blockNameBytes.length % 2 == 0) { 399 bos.write(0); // pad to even size, including length byte. 400 } 401 402 final byte[] blockData = block.getBlockData(); 403 if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) { 404 throw new ImageWriteException("IPTC block data is too long: " + blockData.length); 405 } 406 bos.write4Bytes(blockData.length); 407 bos.write(blockData); 408 if (blockData.length % 2 == 1) { 409 bos.write(0); // pad to even size 410 } 411 } 412 413 bos.flush(); 414 return os.toByteArray(); 415 } 416 417 public byte[] writeIPTCBlock(List<IptcRecord> elements) 418 throws ImageWriteException, IOException { 419 Charset charset = DEFAULT_CHARSET; 420 for (final IptcRecord element : elements) { 421 final byte[] recordData = element.getValue().getBytes(charset); 422 if (!new String(recordData, charset).equals(element.getValue())) { 423 charset = StandardCharsets.UTF_8; 424 break; 425 } 426 } 427 byte[] blockData; 428 final ByteArrayOutputStream baos = new ByteArrayOutputStream(); 429 try (BinaryOutputStream bos = new BinaryOutputStream(baos, getByteOrder())) { 430 if (!charset.equals(DEFAULT_CHARSET)) { 431 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER); 432 bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER); 433 bos.write(ENV_TAG_CODED_CHARACTER_SET); 434 byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE; 435 bos.write2Bytes(codedCharset.length); 436 bos.write(codedCharset); 437 } 438 439 // first, right record version record 440 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER); 441 bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER); 442 bos.write(IptcTypes.RECORD_VERSION.type); // record version record 443 // type. 444 bos.write2Bytes(2); // record version record size 445 bos.write2Bytes(2); // record version value 446 447 // make a copy of the list. 448 elements = new ArrayList<>(elements); 449 450 // sort the list. Records must be in numerical order. 451 final Comparator<IptcRecord> comparator = (e1, e2) -> e2.iptcType.getType() - e1.iptcType.getType(); 452 elements.sort(comparator); 453 // TODO: make sure order right 454 455 // write the list. 456 for (final IptcRecord element : elements) { 457 if (element.iptcType == IptcTypes.RECORD_VERSION) { 458 continue; // ignore 459 } 460 461 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER); 462 bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER); 463 if (element.iptcType.getType() < 0 464 || element.iptcType.getType() > 0xff) { 465 throw new ImageWriteException("Invalid record type: " 466 + element.iptcType.getType()); 467 } 468 bos.write(element.iptcType.getType()); 469 470 final byte[] recordData = element.getValue().getBytes(charset); 471 /* 472 if (!new String(recordData, charset).equals(element.getValue())) { 473 throw new ImageWriteException( 474 "Invalid record value, not " + charset.name()); 475 } 476 */ 477 478 bos.write2Bytes(recordData.length); 479 bos.write(recordData); 480 } 481 } 482 483 blockData = baos.toByteArray(); 484 485 return blockData; 486 } 487 488 private Charset findCharset(byte[] codedCharset) { 489 String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1); 490 try { 491 if (Charset.isSupported(codedCharsetString)) { 492 return Charset.forName(codedCharsetString); 493 } 494 } catch (IllegalArgumentException e) { } 495 // check if encoding is a escape sequence 496 // normalize encoding byte sequence 497 byte[] codedCharsetNormalized = new byte[codedCharset.length]; 498 int j = 0; 499 for (int i = 0; i < codedCharset.length; i++) { 500 if (codedCharset[i] != ' ') { 501 codedCharsetNormalized[j++] = codedCharset[i]; 502 } 503 } 504 505 if (Objects.deepEquals(codedCharsetNormalized, CHARACTER_ESCAPE_SEQUENCE)) { 506 return StandardCharsets.UTF_8; 507 } 508 return DEFAULT_CHARSET; 509 } 510 511}