001/* 002 * $Id: InlineImageUtils.java 4832 2011-05-04 13:35:36Z blowagie $ 003 * 004 * This file is part of the iText (R) project. 005 * Copyright (c) 1998-2011 1T3XT BVBA 006 * Authors: Bruno Lowagie, Kevin Day, Paulo Soares, et al. 007 * 008 * This program is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Affero General Public License version 3 010 * as published by the Free Software Foundation with the addition of the 011 * following permission added to Section 15 as permitted in Section 7(a): 012 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT, 013 * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS. 014 * 015 * This program is distributed in the hope that it will be useful, but 016 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 017 * or FITNESS FOR A PARTICULAR PURPOSE. 018 * See the GNU Affero General Public License for more details. 019 * You should have received a copy of the GNU Affero General Public License 020 * along with this program; if not, see http://www.gnu.org/licenses or write to 021 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 022 * Boston, MA, 02110-1301 USA, or download the license from the following URL: 023 * http://itextpdf.com/terms-of-use/ 024 * 025 * The interactive user interfaces in modified source and object code versions 026 * of this program must display Appropriate Legal Notices, as required under 027 * Section 5 of the GNU Affero General Public License. 028 * 029 * In accordance with Section 7(b) of the GNU Affero General Public License, 030 * a covered work must retain the producer line in every PDF that is created 031 * or manipulated using iText. 032 * 033 * You can be released from the requirements of the license by purchasing 034 * a commercial license. Buying such a license is mandatory as soon as you 035 * develop commercial activities involving the iText software without 036 * disclosing the source code of your own applications. 037 * These activities include: offering paid services to customers as an ASP, 038 * serving PDFs on the fly in a web application, shipping iText with a closed 039 * source product. 040 * 041 * For more information, please contact iText Software Corp. at this 042 * address: sales@itextpdf.com 043 */ 044package com.itextpdf.text.pdf.parser; 045 046import java.io.ByteArrayOutputStream; 047import java.io.IOException; 048import java.util.HashMap; 049import java.util.Map; 050 051import com.itextpdf.text.pdf.PRTokeniser; 052import com.itextpdf.text.pdf.PdfArray; 053import com.itextpdf.text.pdf.PdfContentParser; 054import com.itextpdf.text.pdf.PdfDictionary; 055import com.itextpdf.text.pdf.PdfName; 056import com.itextpdf.text.pdf.PdfNumber; 057import com.itextpdf.text.pdf.PdfObject; 058 059/** 060 * Utility methods to help with processing of inline images 061 * @since 5.0.4 062 */ 063public final class InlineImageUtils { 064 private InlineImageUtils(){} 065 066 /** 067 * Simple class in case users need to differentiate an exception from processing 068 * inline images vs other exceptions 069 * @since 5.0.4 070 */ 071 public static class InlineImageParseException extends IOException{ 072 073 private static final long serialVersionUID = 233760879000268548L; 074 075 public InlineImageParseException(String message) { 076 super(message); 077 } 078 079 } 080 081 /** 082 * Map between key abbreviations allowed in dictionary of inline images and their 083 * equivalent image dictionary keys 084 */ 085 private final static Map<PdfName, PdfName> inlineImageEntryAbbreviationMap; 086 static { // static initializer 087 inlineImageEntryAbbreviationMap = new HashMap<PdfName, PdfName>(); 088 089 // allowed entries - just pass these through 090 inlineImageEntryAbbreviationMap.put(PdfName.BITSPERCOMPONENT, PdfName.BITSPERCOMPONENT); 091 inlineImageEntryAbbreviationMap.put(PdfName.COLORSPACE, PdfName.COLORSPACE); 092 inlineImageEntryAbbreviationMap.put(PdfName.DECODE, PdfName.DECODE); 093 inlineImageEntryAbbreviationMap.put(PdfName.DECODEPARMS, PdfName.DECODEPARMS); 094 inlineImageEntryAbbreviationMap.put(PdfName.FILTER, PdfName.FILTER); 095 inlineImageEntryAbbreviationMap.put(PdfName.HEIGHT, PdfName.HEIGHT); 096 inlineImageEntryAbbreviationMap.put(PdfName.IMAGEMASK, PdfName.IMAGEMASK); 097 inlineImageEntryAbbreviationMap.put(PdfName.INTENT, PdfName.INTENT); 098 inlineImageEntryAbbreviationMap.put(PdfName.INTERPOLATE, PdfName.INTERPOLATE); 099 inlineImageEntryAbbreviationMap.put(PdfName.WIDTH, PdfName.WIDTH); 100 101 // abbreviations - transform these to corresponding correct values 102 inlineImageEntryAbbreviationMap.put(new PdfName("BPC"), PdfName.BITSPERCOMPONENT); 103 inlineImageEntryAbbreviationMap.put(new PdfName("CS"), PdfName.COLORSPACE); 104 inlineImageEntryAbbreviationMap.put(new PdfName("D"), PdfName.DECODE); 105 inlineImageEntryAbbreviationMap.put(new PdfName("DP"), PdfName.DECODEPARMS); 106 inlineImageEntryAbbreviationMap.put(new PdfName("F"), PdfName.FILTER); 107 inlineImageEntryAbbreviationMap.put(new PdfName("H"), PdfName.HEIGHT); 108 inlineImageEntryAbbreviationMap.put(new PdfName("IM"), PdfName.IMAGEMASK); 109 inlineImageEntryAbbreviationMap.put(new PdfName("I"), PdfName.INTERPOLATE); 110 inlineImageEntryAbbreviationMap.put(new PdfName("W"), PdfName.WIDTH); 111 } 112 113 /** 114 * Map between value abbreviations allowed in dictionary of inline images for COLORSPACE 115 */ 116 private static final Map<PdfName, PdfName> inlineImageColorSpaceAbbreviationMap; 117 static { 118 inlineImageColorSpaceAbbreviationMap = new HashMap<PdfName, PdfName>(); 119 120 inlineImageColorSpaceAbbreviationMap.put(new PdfName("G"), PdfName.DEVICEGRAY); 121 inlineImageColorSpaceAbbreviationMap.put(new PdfName("RGB"), PdfName.DEVICERGB); 122 inlineImageColorSpaceAbbreviationMap.put(new PdfName("CMYK"), PdfName.DEVICECMYK); 123 inlineImageColorSpaceAbbreviationMap.put(new PdfName("I"), PdfName.INDEXED); 124 } 125 126 /** 127 * Map between value abbreviations allowed in dictionary of inline images for FILTER 128 */ 129 private static final Map<PdfName, PdfName> inlineImageFilterAbbreviationMap; 130 static { 131 inlineImageFilterAbbreviationMap = new HashMap<PdfName, PdfName>(); 132 133 inlineImageFilterAbbreviationMap.put(new PdfName("AHx"), PdfName.ASCIIHEXDECODE); 134 inlineImageFilterAbbreviationMap.put(new PdfName("A85"), PdfName.ASCII85DECODE); 135 inlineImageFilterAbbreviationMap.put(new PdfName("LZW"), PdfName.LZWDECODE); 136 inlineImageFilterAbbreviationMap.put(new PdfName("Fl"), PdfName.FLATEDECODE); 137 inlineImageFilterAbbreviationMap.put(new PdfName("RL"), PdfName.RUNLENGTHDECODE); 138 inlineImageFilterAbbreviationMap.put(new PdfName("CCF"), PdfName.CCITTFAXDECODE); 139 inlineImageFilterAbbreviationMap.put(new PdfName("DCT"), PdfName.DCTDECODE); 140 } 141 142 /** 143 * Parses an inline image from the provided content parser. The parser must be positioned immediately following the BI operator in the content stream. 144 * The parser will be left with current position immediately following the EI operator that terminates the inline image 145 * @param ps the content parser to use for reading the image. 146 * @param colorSpaceDic a color space dictionary 147 * @return the parsed image 148 * @throws IOException if anything goes wring with the parsing 149 * @throws InlineImageParseException if parsing of the inline image failed due to issues specific to inline image processing 150 */ 151 public static PdfImageObject parseInlineImage(PdfContentParser ps, PdfDictionary colorSpaceDic) throws IOException{ 152 PdfDictionary inlineImageDictionary = parseInlineImageDictionary(ps); 153 byte[] samples = parseInlineImageSamples(inlineImageDictionary, colorSpaceDic, ps); 154 return new PdfImageObject(inlineImageDictionary, samples); 155 } 156 157 /** 158 * Parses the next inline image dictionary from the parser. The parser must be positioned immediately following the EI operator. 159 * The parser will be left with position immediately following the whitespace character that follows the ID operator that ends the inline image dictionary. 160 * @param ps the parser to extract the embedded image information from 161 * @return the dictionary for the inline image, with any abbreviations converted to regular image dictionary keys and values 162 * @throws IOException if the parse fails 163 */ 164 private static PdfDictionary parseInlineImageDictionary(PdfContentParser ps) throws IOException{ 165 // by the time we get to here, we have already parsed the BI operator 166 PdfDictionary dictionary = new PdfDictionary(); 167 168 for(PdfObject key = ps.readPRObject(); key != null && !"ID".equals(key.toString()); key = ps.readPRObject()){ 169 PdfObject value = ps.readPRObject(); 170 171 PdfName resolvedKey = inlineImageEntryAbbreviationMap.get(key); 172 if (resolvedKey == null) 173 resolvedKey = (PdfName)key; 174 175 dictionary.put(resolvedKey, getAlternateValue(resolvedKey, value)); 176 } 177 178 int ch = ps.getTokeniser().read(); 179 if (!PRTokeniser.isWhitespace(ch)) 180 throw new IOException("Unexpected character " + ch + " found after ID in inline image"); 181 182 return dictionary; 183 } 184 185 /** 186 * Transforms value abbreviations into their corresponding real value 187 * @param key the key that the value is for 188 * @param value the value that might be an abbreviation 189 * @return if value is an allowed abbreviation for the key, the expanded value for that abbreviation. Otherwise, value is returned without modification 190 */ 191 private static PdfObject getAlternateValue(PdfName key, PdfObject value){ 192 if (key == PdfName.FILTER){ 193 if (value instanceof PdfName){ 194 PdfName altValue = inlineImageFilterAbbreviationMap.get(value); 195 if (altValue != null) 196 return altValue; 197 } else if (value instanceof PdfArray){ 198 PdfArray array = ((PdfArray)value); 199 PdfArray altArray = new PdfArray(); 200 int count = array.size(); 201 for(int i = 0; i < count; i++){ 202 altArray.add(getAlternateValue(key, array.getPdfObject(i))); 203 } 204 return altArray; 205 } 206 } else if (key == PdfName.COLORSPACE){ 207 PdfName altValue = inlineImageColorSpaceAbbreviationMap.get(value); 208 if (altValue != null) 209 return altValue; 210 } 211 212 return value; 213 } 214 215 /** 216 * @param colorSpaceName the name of the color space. If null, a bi-tonal (black and white) color space is assumed. 217 * @return the components per pixel for the specified color space 218 */ 219 private static int getComponentsPerPixel(PdfName colorSpaceName, PdfDictionary colorSpaceDic){ 220 if (colorSpaceName == null) 221 return 1; 222 if (colorSpaceName.equals(PdfName.DEVICEGRAY)) 223 return 1; 224 if (colorSpaceName.equals(PdfName.DEVICERGB)) 225 return 3; 226 if (colorSpaceName.equals(PdfName.DEVICECMYK)) 227 return 4; 228 229 if (colorSpaceDic != null){ 230 PdfArray colorSpace = colorSpaceDic.getAsArray(colorSpaceName); 231 if (colorSpace != null){ 232 if (PdfName.INDEXED.equals(colorSpace.getAsName(0))){ 233 return 1; 234 } 235 } 236 } 237 238 throw new IllegalArgumentException("Unexpected color space " + colorSpaceName); 239 } 240 241 /** 242 * Computes the number of unfiltered bytes that each row of the image will contain. 243 * If the number of bytes results in a partial terminating byte, this number is rounded up 244 * per the PDF specification 245 * @param imageDictionary the dictionary of the inline image 246 * @return the number of bytes per row of the image 247 */ 248 private static int computeBytesPerRow(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic){ 249 PdfNumber wObj = imageDictionary.getAsNumber(PdfName.WIDTH); 250 PdfNumber bpcObj = imageDictionary.getAsNumber(PdfName.BITSPERCOMPONENT); 251 int cpp = getComponentsPerPixel(imageDictionary.getAsName(PdfName.COLORSPACE), colorSpaceDic); 252 253 int w = wObj.intValue(); 254 int bpc = bpcObj != null ? bpcObj.intValue() : 1; 255 256 257 int bytesPerRow = (w * bpc * cpp + 7) / 8; 258 259 return bytesPerRow; 260 } 261 262 /** 263 * Parses the samples of the image from the underlying content parser, ignoring all filters. 264 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary. 265 * The parser will be left positioned immediately following the EI operator. 266 * This is primarily useful if no filters have been applied. 267 * @param imageDictionary the dictionary of the inline image 268 * @param ps the content parser 269 * @return the samples of the image 270 * @throws IOException if anything bad happens during parsing 271 */ 272 private static byte[] parseUnfilteredSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) throws IOException{ 273 // special case: when no filter is specified, we just read the number of bits 274 // per component, multiplied by the width and height. 275 if (imageDictionary.contains(PdfName.FILTER)) 276 throw new IllegalArgumentException("Dictionary contains filters"); 277 278 PdfNumber h = imageDictionary.getAsNumber(PdfName.HEIGHT); 279 280 int bytesToRead = computeBytesPerRow(imageDictionary, colorSpaceDic) * h.intValue(); 281 byte[] bytes = new byte[bytesToRead]; 282 PRTokeniser tokeniser = ps.getTokeniser(); 283 284 int shouldBeWhiteSpace = tokeniser.read(); // skip next character (which better be a whitespace character - I suppose we could check for this) 285 // from the PDF spec: Unless the image uses ASCIIHexDecode or ASCII85Decode as one of its filters, the ID operator shall be followed by a single white-space character, and the next character shall be interpreted as the first byte of image data. 286 // unfortunately, we've seen some PDFs where there is no space following the ID, so we have to capture this case and handle it 287 int startIndex = 0; 288 if (!PRTokeniser.isWhitespace(shouldBeWhiteSpace)){ 289 bytes[0] = (byte)shouldBeWhiteSpace; 290 startIndex++; 291 } 292 for(int i = startIndex; i < bytesToRead; i++){ 293 int ch = tokeniser.read(); 294 if (ch == -1) 295 throw new InlineImageParseException("End of content stream reached before end of image data"); 296 297 bytes[i] = (byte)ch; 298 } 299 PdfObject ei = ps.readPRObject(); 300 if (!ei.toString().equals("EI")) 301 throw new InlineImageParseException("EI not found after end of image data"); 302 303 return bytes; 304 } 305 306 /** 307 * Parses the samples of the image from the underlying content parser, accounting for filters 308 * The parser must be positioned immediately after the ID operator that ends the inline image's dictionary. 309 * The parser will be left positioned immediately following the EI operator. 310 * <b>Note:</b>This implementation does not actually apply the filters at this time 311 * @param imageDictionary the dictionary of the inline image 312 * @param ps the content parser 313 * @return the samples of the image 314 * @throws IOException if anything bad happens during parsing 315 */ 316 private static byte[] parseInlineImageSamples(PdfDictionary imageDictionary, PdfDictionary colorSpaceDic, PdfContentParser ps) throws IOException{ 317 // by the time we get to here, we have already parsed the ID operator 318 319 if (!imageDictionary.contains(PdfName.FILTER)){ 320 return parseUnfilteredSamples(imageDictionary, colorSpaceDic, ps); 321 } 322 323 324 // read all content until we reach an EI operator surrounded by whitespace. 325 // The following algorithm has two potential issues: what if the image stream 326 // contains <ws>EI<ws> ? 327 // Plus, there are some streams that don't have the <ws> before the EI operator 328 // it sounds like we would have to actually decode the content stream, which 329 // I'd rather avoid right now. 330 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 331 ByteArrayOutputStream accumulated = new ByteArrayOutputStream(); 332 int ch; 333 int found = 0; 334 PRTokeniser tokeniser = ps.getTokeniser(); 335 336 while ((ch = tokeniser.read()) != -1){ 337 if (found == 0 && PRTokeniser.isWhitespace(ch)){ 338 found++; 339 accumulated.write(ch); 340 } else if (found == 1 && ch == 'E'){ 341 found++; 342 accumulated.write(ch); 343 } else if (found == 1 && PRTokeniser.isWhitespace(ch)){ 344 // this clause is needed if we have a white space character that is part of the image data 345 // followed by a whitespace character that precedes the EI operator. In this case, we need 346 // to flush the first whitespace, then treat the current whitespace as the first potential 347 // character for the end of stream check. Note that we don't increment 'found' here. 348 baos.write(accumulated.toByteArray()); 349 accumulated.reset(); 350 accumulated.write(ch); 351 } else if (found == 2 && ch == 'I'){ 352 found++; 353 accumulated.write(ch); 354 } else if (found == 3 && PRTokeniser.isWhitespace(ch)){ 355 return baos.toByteArray(); 356 } else { 357 baos.write(accumulated.toByteArray()); 358 accumulated.reset(); 359 360 baos.write(ch); 361 found = 0; 362 } 363 } 364 throw new InlineImageParseException("Could not find image data or EI"); 365 } 366}