001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.imaging.formats.jpeg.iptc;
019
020import static org.apache.commons.imaging.common.BinaryFunctions.read2Bytes;
021import static org.apache.commons.imaging.common.BinaryFunctions.read4Bytes;
022import static org.apache.commons.imaging.common.BinaryFunctions.readByte;
023import static org.apache.commons.imaging.common.BinaryFunctions.readBytes;
024import static org.apache.commons.imaging.common.BinaryFunctions.slice;
025import static org.apache.commons.imaging.common.BinaryFunctions.startsWith;
026
027import java.io.ByteArrayInputStream;
028import java.io.ByteArrayOutputStream;
029import java.io.IOException;
030import java.io.InputStream;
031import java.nio.ByteOrder;
032import java.nio.charset.Charset;
033import java.nio.charset.StandardCharsets;
034import java.util.ArrayList;
035import java.util.Arrays;
036import java.util.Comparator;
037import java.util.List;
038import java.util.Objects;
039import java.util.logging.Level;
040import java.util.logging.Logger;
041
042import org.apache.commons.imaging.ImageReadException;
043import org.apache.commons.imaging.ImageWriteException;
044import org.apache.commons.imaging.ImagingConstants;
045import org.apache.commons.imaging.ImagingParameters;
046import org.apache.commons.imaging.common.BinaryFileParser;
047import org.apache.commons.imaging.common.BinaryFunctions;
048import org.apache.commons.imaging.common.BinaryOutputStream;
049import org.apache.commons.imaging.common.ByteConversions;
050import org.apache.commons.imaging.formats.jpeg.JpegConstants;
051import org.apache.commons.imaging.internal.Debug;
052
053public class IptcParser extends BinaryFileParser {
054
    /** Logger named after this class; used for FINE-level diagnostics while parsing IPTC records. */
    private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName());

    /** Byte order used when reading the multi-byte fields of App13 image resource blocks. */
    private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN;

    /**
     * Block types (or Image Resource IDs) that are not recommended to be
     * interpreted when libraries process Photoshop IPTC metadata.
     *
     * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/"> Adobe Photoshop File Formats Specification</a>
     * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246">IMAGING-246</a>
     * @since 1.0-alpha2
     */
    private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087);

    /** Charset used for record values unless a coded-character-set record selects another one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
    /** Record type, within the envelope record, of the record that carries the coded character set. */
    private static final int ENV_TAG_CODED_CHARACTER_SET = 90;
    /** Escape sequence (ESC % G) that this parser reads and writes as the marker for UTF-8 record values. */
    private static final byte[] CHARACTER_ESCAPE_SEQUENCE = {'\u001B', '%', 'G'};
072
073    public IptcParser() {
074        setByteOrder(ByteOrder.BIG_ENDIAN);
075    }
076
077    public boolean isPhotoshopJpegSegment(final byte[] segmentData) {
078        if (!startsWith(segmentData,
079                JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING)) {
080            return false;
081        }
082
083        final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size();
084        return (index + 4) <= segmentData.length
085                && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM;
086    }
087
    /*
     * In practice, App13 segments are only used for Photoshop/IPTC metadata.
     * However, we should not treat App13 segments without Photoshop's
     * signature as Photoshop/IPTC segments.
     *
     * A Photoshop/IPTC App13 segment begins with the Photoshop Identification
     * string.
     *
     * There follow 0-N blocks (Photoshop calls them "Image Resource Blocks").
     *
     * Each block has the following structure:
     *
     * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13
     *    segment.
     * 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, aka.
     *    IPTC_NAA_RECORD_IMAGE_RESOURCE_ID.
     * 3. Block name as a Pascal String. This is padded to have an even length.
     * 4. 4-byte size (in bytes).
     * 5. Block data. This is also padded to have an even length.
     *
     * The block data consists of 0-N records. A record has the following
     * structure:
     *
     * 1. 2-byte prefix: a tag marker byte followed by a record number byte.
     *    For IPTC application records the value is always 0x1C02.
     * 2. 1-byte record type. The record types are documented by the IPTC. See
     *    IptcConstants.
     * 3. 2-byte record size (in bytes).
     * 4. Record data, "record size" bytes long.
     *
     * Record data (unlike block data) is NOT padded to have an even length.
     *
     * Record data, for IPTC records, should always be ISO-8859-1. But
     * according to SANSELAN-33, this isn't always the case.
     *
     * The exception is the first record in the block, which must always be a
     * record version record, whose value is a two-byte number; the value is
     * 0x02.
     *
     * Some IPTC blocks are missing this first "record version" record, so we
     * don't require it.
     */
125    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters params)
126            throws ImageReadException, IOException {
127        final boolean strict =  params != null && params.isStrict();
128
129        return parsePhotoshopSegment(bytes, strict);
130    }
131
132    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImageReadException,
133            IOException {
134        final List<IptcRecord> records = new ArrayList<>();
135
136        final List<IptcBlock> blocks = parseAllBlocks(bytes, strict);
137
138        for (final IptcBlock block : blocks) {
139            // Ignore everything but IPTC data.
140            if (!block.isIPTCBlock()) {
141                continue;
142            }
143
144            records.addAll(parseIPTCBlock(block.getBlockData()));
145        }
146
147        return new PhotoshopApp13Data(records, blocks);
148    }
149
150    protected List<IptcRecord> parseIPTCBlock(final byte[] bytes) {
151        Charset charset = DEFAULT_CHARSET;
152        final List<IptcRecord> elements = new ArrayList<>();
153
154        int index = 0;
155        // Integer recordVersion = null;
156        while (index + 1 < bytes.length) {
157            final int tagMarker = 0xff & bytes[index++];
158            Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")");
159
160            if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) {
161                if (LOGGER.isLoggable(Level.FINE)) {
162                    LOGGER.fine("Unexpected record tag marker in IPTC data.");
163                }
164                return elements;
165            }
166
167            final int recordNumber = 0xff & bytes[index++];
168            Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")");
169
170            // int recordPrefix = convertByteArrayToShort("recordPrefix", index,
171            // bytes);
172            // if (verbose)
173            // Debug.debug("recordPrefix", recordPrefix + " (0x"
174            // + Integer.toHexString(recordPrefix) + ")");
175            // index += 2;
176            //
177            // if (recordPrefix != IPTC_RECORD_PREFIX)
178            // {
179            // if (verbose)
180            // System.out
181            // .println("Unexpected record prefix in IPTC data!");
182            // return elements;
183            // }
184
185            // throw new ImageReadException(
186            // "Unexpected record prefix in IPTC data.");
187
188            final int recordType = 0xff & bytes[index];
189            Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")");
190            index++;
191
192            final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder());
193            index += 2;
194
195            final boolean extendedDataset = recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE;
196            final int dataFieldCountLength = recordSize & 0x7fff;
197            if (extendedDataset) {
198                Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength);
199            }
200            if (extendedDataset) {
201                // ignore extended dataset and everything after.
202                return elements;
203            }
204
205            final byte[] recordData = slice(bytes, index, recordSize);
206            index += recordSize;
207
208            // Debug.debug("recordSize", recordSize + " (0x"
209            // + Integer.toHexString(recordSize) + ")");
210
211            if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) {
212                charset = findCharset(recordData);
213                continue;
214            }
215
216            if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) {
217                continue;
218            }
219
220            if (recordType == 0) {
221                if (LOGGER.isLoggable(Level.FINE)) {
222                    LOGGER.fine("ignore record version record! " + elements.size());
223                }
224                // ignore "record version" record;
225                continue;
226            }
227            // if (recordVersion == null)
228            // {
229            // // The first record in a JPEG/Photoshop IPTC block must be
230            // // the record version.
231            // if (recordType != 0)
232            // throw new ImageReadException("Missing record version: "
233            // + recordType);
234            // recordVersion = new Integer(convertByteArrayToShort(
235            // "recordNumber", recordData));
236            //
237            // if (recordSize != 2)
238            // throw new ImageReadException(
239            // "Invalid record version record size: " + recordSize);
240            //
241            // // JPEG/Photoshop IPTC metadata is always in Record version
242            // // 2
243            // if (recordVersion.intValue() != 2)
244            // throw new ImageReadException(
245            // "Invalid IPTC record version: " + recordVersion);
246            //
247            // // Debug.debug("recordVersion", recordVersion);
248            // continue;
249            // }
250
251            final String value = new String(recordData, charset);
252
253            final IptcType iptcType = IptcTypeLookup.getIptcType(recordType);
254
255            // Debug.debug("iptcType", iptcType);
256            // debugByteArray("iptcData", iptcData);
257            // Debug.debug();
258
259            // if (recordType == IPTC_TYPE_CREDIT.type
260            // || recordType == IPTC_TYPE_OBJECT_NAME.type)
261            // {
262            // this.debugByteArray("recordData", recordData);
263            // Debug.debug("index", IPTC_TYPE_CREDIT.name);
264            // }
265
266            final IptcRecord element = new IptcRecord(iptcType, value);
267            elements.add(element);
268        }
269
270        return elements;
271    }
272
273    protected List<IptcBlock> parseAllBlocks(final byte[] bytes,
274            final boolean strict) throws ImageReadException, IOException {
275        final List<IptcBlock> blocks = new ArrayList<>();
276
277        try (InputStream bis = new ByteArrayInputStream(bytes)) {
278
279            // Note that these are unsigned quantities. Name is always an even
280            // number of bytes (including the 1st byte, which is the size.)
281
282            final byte[] idString = readBytes("", bis,
283                    JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(),
284                    "App13 Segment missing identification string");
285            if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) {
286                throw new ImageReadException("Not a Photoshop App13 Segment");
287            }
288
289            // int index = PHOTOSHOP_IDENTIFICATION_STRING.length;
290
291            while (true) {
292                final int imageResourceBlockSignature;
293                try {
294                    imageResourceBlockSignature = read4Bytes("", bis,
295                            "Image Resource Block missing identification string", APP13_BYTE_ORDER);
296                } catch (final IOException ioEx) {
297                    break;
298                }
299                if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) {
300                    throw new ImageReadException(
301                            "Invalid Image Resource Block Signature");
302                }
303
304                final int blockType = read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER);
305                Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
306
307                // skip blocks that the photoshop spec recommends to, see IMAGING-246
308                if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) {
309                    Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
310                    // if there is still data in this block, before the next image resource block
311                    // (8BIM), then we must consume these bytes to leave a pointer ready to read
312                    // the next block
313                    BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis);
314                    continue;
315                }
316
317                final int blockNameLength = readByte("Name length", bis, "Image Resource Block missing name length");
318                if (blockNameLength > 0) {
319                    Debug.debug("blockNameLength: " + blockNameLength + " (0x"
320                            + Integer.toHexString(blockNameLength) + ")");
321                }
322                byte[] blockNameBytes;
323                if (blockNameLength == 0) {
324                    readByte("Block name bytes", bis, "Image Resource Block has invalid name");
325                    blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY;
326                } else {
327                    try {
328                        blockNameBytes = readBytes("", bis, blockNameLength,
329                                "Invalid Image Resource Block name");
330                    } catch (final IOException ioEx) {
331                        if (strict) {
332                            throw ioEx;
333                        }
334                        break;
335                    }
336
337                    if (blockNameLength % 2 == 0) {
338                        readByte("Padding byte", bis, "Image Resource Block missing padding byte");
339                    }
340                }
341
342                final int blockSize = read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER);
343                Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")");
344
345                /*
346                 * doesn't catch cases where blocksize is invalid but is still less
347                 * than bytes.length but will at least prevent OutOfMemory errors
348                 */
349                if (blockSize > bytes.length) {
350                    throw new ImageReadException("Invalid Block Size : " + blockSize + " > " + bytes.length);
351                }
352
353                final byte[] blockData;
354                try {
355                    blockData = readBytes("", bis, blockSize, "Invalid Image Resource Block data");
356                } catch (final IOException ioEx) {
357                    if (strict) {
358                        throw ioEx;
359                    }
360                    break;
361                }
362
363                blocks.add(new IptcBlock(blockType, blockNameBytes, blockData));
364
365                if ((blockSize % 2) != 0) {
366                    readByte("Padding byte", bis, "Image Resource Block missing padding byte");
367                }
368            }
369
370            return blocks;
371        }
372    }
373
374    // private void writeIPTCRecord(BinaryOutputStream bos, )
375
376    public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data)
377            throws IOException, ImageWriteException {
378        final ByteArrayOutputStream os = new ByteArrayOutputStream();
379        final BinaryOutputStream bos = new BinaryOutputStream(os);
380
381        JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos);
382
383        final List<IptcBlock> blocks = data.getRawBlocks();
384        for (final IptcBlock block : blocks) {
385            bos.write4Bytes(JpegConstants.CONST_8BIM);
386
387            if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) {
388                throw new ImageWriteException("Invalid IPTC block type.");
389            }
390            bos.write2Bytes(block.getBlockType());
391
392            final byte[] blockNameBytes = block.getBlockNameBytes();
393            if (blockNameBytes.length > 255) {
394                throw new ImageWriteException("IPTC block name is too long: " + blockNameBytes.length);
395            }
396            bos.write(blockNameBytes.length);
397            bos.write(blockNameBytes);
398            if (blockNameBytes.length % 2 == 0) {
399                bos.write(0); // pad to even size, including length byte.
400            }
401
402            final byte[] blockData = block.getBlockData();
403            if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
404                throw new ImageWriteException("IPTC block data is too long: " + blockData.length);
405            }
406            bos.write4Bytes(blockData.length);
407            bos.write(blockData);
408            if (blockData.length % 2 == 1) {
409                bos.write(0); // pad to even size
410            }
411        }
412
413        bos.flush();
414        return os.toByteArray();
415    }
416
417    public byte[] writeIPTCBlock(List<IptcRecord> elements)
418            throws ImageWriteException, IOException {
419        Charset charset = DEFAULT_CHARSET;
420        for (final IptcRecord element : elements) {
421            final byte[] recordData = element.getValue().getBytes(charset);
422            if (!new String(recordData, charset).equals(element.getValue())) {
423                charset = StandardCharsets.UTF_8;
424                break;
425            }
426        }
427        byte[] blockData;
428        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
429        try (BinaryOutputStream bos = new BinaryOutputStream(baos, getByteOrder())) {
430            if (!charset.equals(DEFAULT_CHARSET)) {
431                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
432                bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER);
433                bos.write(ENV_TAG_CODED_CHARACTER_SET);
434                byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE;
435                bos.write2Bytes(codedCharset.length);
436                bos.write(codedCharset);
437            }
438
439            // first, right record version record
440            bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
441            bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
442            bos.write(IptcTypes.RECORD_VERSION.type); // record version record
443                                                      // type.
444            bos.write2Bytes(2); // record version record size
445            bos.write2Bytes(2); // record version value
446
447            // make a copy of the list.
448            elements = new ArrayList<>(elements);
449
450            // sort the list. Records must be in numerical order.
451            final Comparator<IptcRecord> comparator = (e1, e2) -> e2.iptcType.getType() - e1.iptcType.getType();
452            elements.sort(comparator);
453            // TODO: make sure order right
454
455            // write the list.
456            for (final IptcRecord element : elements) {
457                if (element.iptcType == IptcTypes.RECORD_VERSION) {
458                    continue; // ignore
459                }
460
461                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
462                bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
463                if (element.iptcType.getType() < 0
464                        || element.iptcType.getType() > 0xff) {
465                    throw new ImageWriteException("Invalid record type: "
466                            + element.iptcType.getType());
467                }
468                bos.write(element.iptcType.getType());
469
470                final byte[] recordData = element.getValue().getBytes(charset);
471                /*
472                if (!new String(recordData, charset).equals(element.getValue())) {
473                    throw new ImageWriteException(
474                            "Invalid record value, not " + charset.name());
475                }
476                */
477
478                bos.write2Bytes(recordData.length);
479                bos.write(recordData);
480            }
481        }
482
483        blockData = baos.toByteArray();
484
485        return blockData;
486    }
487
488    private Charset findCharset(byte[] codedCharset) {
489        String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1);
490        try {
491            if (Charset.isSupported(codedCharsetString)) {
492                return Charset.forName(codedCharsetString);
493            }
494        } catch (IllegalArgumentException e) { }
495        // check if encoding is a escape sequence
496        // normalize encoding byte sequence
497        byte[] codedCharsetNormalized = new byte[codedCharset.length];
498        int j = 0;
499        for (int i = 0; i < codedCharset.length; i++) {
500            if (codedCharset[i] != ' ') {
501                codedCharsetNormalized[j++] = codedCharset[i];
502            }
503        }
504
505        if (Objects.deepEquals(codedCharsetNormalized, CHARACTER_ESCAPE_SEQUENCE)) {
506            return StandardCharsets.UTF_8;
507        }
508        return DEFAULT_CHARSET;
509    }
510
511}