/*
 * Copyright 2007-2009 Medsea Business Solutions S.L.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * <p>
 * </p>
 * @author Steven McArdle
 */
package eu.medsea.mimeutil;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.medsea.mimeutil.detector.MimeDetector;
import eu.medsea.mimeutil.handler.TextMimeHandler;
import eu.medsea.util.EncodingGuesser;

/**
 * This MimeDetector cannot be registered, unregistered or subclassed.
 * It is a default MimeDetector that is pre-installed into the mime-util utility and
 * is used as the FIRST MimeDetector.
 * <p>
 * You can influence this MimeDetector in several ways.
 * <ul>
 * <li>Specify a different list of preferred encodings using the static TextMimeDetector.setPreferredEncodings(...) method.</li>
 * <li>Change the list of supported encodings using the static EncodingGuesser.setSupportedEncodings(...) method.</li>
 * <li>Register TextMimeHandler(s) using the static TextMimeDetector.registerTextMimeHandler(...) method (very, VERY powerful).</li>
 * </ul>
 * <p>
 * The TextMimeDetector.setPreferredEncodings(...) method is used to provide a preferred list of encodings. The final encoding for the MimeType
 * will be the first one in this list that is also contained in the possible encodings returned from the EncodingGuesser class. If none of
 * these match, the default encoding reported by the EncodingGuesser is used when it is one of the possible encodings; otherwise the first
 * entry in the possible encodings collection is used.
 * </p>
 * <p>
 * The EncodingGuesser.setSupportedEncodings(...) method is used to set the list of encodings that will be considered when trying to guess the
 * encoding. If you provide encodings that are not supported by your JVM an error is logged and the next encoding is tried. If you set this to an
 * empty Collection then you will effectively turn this MimeDetector OFF (the default). This is the recommended way to disable this MimeDetector.
 * The most common usage scenario for this method is when your application is designed to support only a limited set of encodings such as
 * UTF-8 and UTF-16 encoded text files. You can set the supported encodings list to this sub set of encodings and improve the performance
 * of this MimeDetector greatly.
 * </p>
 * <p>
 * The TextMimeDetector.registerTextMimeHandler(...) method can be used to register special TextMimeHandler(s). These TextMimeHandler(s) are
 * delegated to once valid encodings have been found for the content contained in the File, InputStream or byte []. The handlers can influence
 * both the returned MimeType and encoding of any matched content. For instance, the default behavior is to return a MimeType of text/plain and
 * encoding set according to the rules above. The handler(s) allow you to further process the content and decide that it is in fact a text/xml
 * or application/svg-xml or even mytype/mysubtype. You can also change the assigned encoding as it may be wrong for your new MimeType.
 * For instance, if you decide the MimeType is really an XML file and not just a standard text/plain file and the detector calculated that the
 * best encoding is UTF-8 but you detect an encoding attribute in the XML content for ISO-8859-1, you can set this as well thus returning
 * a TextMimeType of application/xml with an encoding of ISO-8859-1 instead of a TextMimeType of text/plain and an encoding of UTF-8.<br/><br/>
 * IMPORTANT: Your handler(s) will only get to see and act on content that this MimeDetector thinks is text in the first place. So if your
 * restrictions on supported encodings mean a file is no longer detected as text then your handler(s) will never be called.
 * </p>
 * <p>
 * The methods will do their best to eliminate any binary files before trying to detect an encoding.
 * However, if a binary file contains only a few bytes of data or you are very unlucky it could be
 * mistakenly recognised as a text file and processed by this MimeDetector.
 * </p>
 * <p>
 * The Collection(s) returned from the methods in this class will contain either 0 or 1 MimeType entry
 * of type TextMimeType with a mime type of "text/plain" or whatever matching registered TextMimeHandler(s) decide to return.
 * You can test for matches from this MimeDetector by using the instanceof operator on the Collection of returned MimeType(s) to your code
 * (remember, the Collection returned to you is the accumulated collection from ALL registered MimeDetectors). You can retrieve the
 * encoding using the getEncoding() method of TextMimeType after casting the MimeType to a TextMimeType.
 * </p>
 * <p>
 * You should also remember that if this MimeDetector puts a TextMimeType into the eventual Collection of MimeType(s) returned to your code
 * of say "text/plain" and one or more of the other registered MimeDetector(s) also add an instance of "text/plain" in accordance with their
 * detection rules, the type will not be changed from TextMimeType to MimeType. Only the specificity value of the MimeType will be increased
 * thus improving the likelihood that this MimeType will be returned from the MimeUtil.getMostSpecificMimeType(Collection mimeTypes) method.
 * </p>
 * @author Steven McArdle
 *
 */
public final class TextMimeDetector extends MimeDetector {

    private static final Logger log = LoggerFactory.getLogger(TextMimeDetector.class);

    // The maximum amount of data to retrieve from a stream
    private static final int BUFFER_SIZE = 1024;

    // No text file should have 2 or more consecutive NULL values, i.e. at most
    // MAX_NULL_VALUES NULL bytes in a row are tolerated before the content is
    // classified as binary.
    private static final int MAX_NULL_VALUES = 1;

    // Replaced (not mutated) by setPreferredEncodings(...)
    private static Collection preferredEncodings = new ArrayList();
    static {
        TextMimeDetector.setPreferredEncodings(new String [] {"UTF-16", "UTF-8", "ISO-8859-1", "windows-1252", "US-ASCII"} );
    }

    // Registered list of TextMimeHandler(s), consulted in registration order
    private static Collection handlers = new ArrayList();

    // Private so nobody can register one using the MimeUtil.registerMimeDetector(...) method
    private TextMimeDetector() {
    }

    // Package scoped so that the class can still be created for use by mime-util without resorting to a singleton approach.
    // The argument is ignored; it only exists to give this constructor a distinct signature.
    // Could change this in the future !!!
    TextMimeDetector(int dummy) {
        this();
    }

    /**
     * @return a human readable description of what this MimeDetector does
     * @see MimeDetector#getDescription()
     */
    public String getDescription() {
        return "Determine if a file or stream contains a text mime type. If so then return TextMimeType with text/plain and the best guess encoding.";
    }

    /**
     * This MimeDetector requires content so defer to the file method.
     *
     * @param fileName path of the file whose content should be inspected
     * @return see {@link #getMimeTypesFile(File)}
     * @throws UnsupportedOperationException see {@link #getMimeTypesFile(File)}
     */
    public Collection getMimeTypesFileName(String fileName)
            throws UnsupportedOperationException {
        return getMimeTypesFile(new File(fileName));
    }

    /**
     * We only want to deal with the stream from the URL.
     * The stream is opened, buffered, inspected and always closed again.
     *
     * @param url the location of the content to inspect
     * @return see {@link #getMimeTypesInputStream(InputStream)}
     * @throws UnsupportedOperationException if the content is not recognised as text
     * @throws MimeException if the stream cannot be opened or read
     * @see MimeDetector#getMimeTypesURL(URL)
     */
    public Collection getMimeTypesURL(URL url)
            throws UnsupportedOperationException {

        InputStream in = null;
        try {
            in = new BufferedInputStream(MimeUtil.getInputStreamForURL(url));
            return getMimeTypesInputStream(in);
        } catch (UnsupportedOperationException e) {
            // "Not text" must reach the caller unchanged, not wrapped in a MimeException
            throw e;
        } catch (Exception e) {
            throw new MimeException(e);
        } finally {
            closeLoggingFailure(in);
        }
    }

    /**
     * We only want to deal with the stream for the file.
     * The stream is opened, buffered, inspected and always closed again.
     *
     * @param file the file whose content should be inspected
     * @return see {@link #getMimeTypesInputStream(InputStream)}
     * @throws UnsupportedOperationException if the file does not exist or its content is not recognised as text
     * @throws MimeException if the file cannot be opened or read
     * @see MimeDetector#getMimeTypesFile(File)
     */
    public Collection getMimeTypesFile(File file)
            throws UnsupportedOperationException {

        if (!file.exists()) {
            throw new UnsupportedOperationException("This MimeDetector requires actual content.");
        }
        InputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(file));
            return getMimeTypesInputStream(in);
        } catch (UnsupportedOperationException e) {
            // "Not text" must reach the caller unchanged, not wrapped in a MimeException
            throw e;
        } catch (Exception e) {
            throw new MimeException(e);
        } finally {
            closeLoggingFailure(in);
        }
    }

    /**
     * Reads up to BUFFER_SIZE bytes from the stream and inspects them with
     * {@link #getMimeTypesByteArray(byte[])}.
     * The stream is marked before reading and reset afterwards so that it is
     * left positioned where it was on entry; it is NOT closed by this method.
     *
     * @param in the stream to inspect; a failure to reset it is reported as a MimeException
     * @return see {@link #getMimeTypesByteArray(byte[])}
     * @throws UnsupportedOperationException if the content is not recognised as text
     * @throws MimeException if the stream cannot be read or reset
     * @see MimeDetector#getMimeTypesInputStream(InputStream)
     */
    public Collection getMimeTypesInputStream(InputStream in)
            throws UnsupportedOperationException {

        int len = TextMimeDetector.BUFFER_SIZE;
        byte [] data = new byte [len];
        byte [] copy = null;
        int offset = 0;

        // Mark the input stream
        in.mark(len);

        try {
            // Since an InputStream might return only some data (not all
            // requested), we have to read in a loop until either EOF is
            // reached or the desired number of bytes have been read.
            int restBytesToRead = len;
            while (restBytesToRead > 0) {
                int bytesRead = in.read(data, offset, restBytesToRead);
                if (bytesRead < 0) {
                    break; // EOF
                }
                offset += bytesRead;
                restBytesToRead -= bytesRead;
            }
            if (offset < len) {
                // Hand on only the bytes actually read, not the unused tail of the buffer
                copy = new byte[offset];
                System.arraycopy(data, 0, copy, 0, offset);
            } else {
                copy = data;
            }
        } catch (IOException ioe) {
            throw new MimeException(ioe);
        } finally {
            try {
                // Reset the input stream to where it was marked.
                in.reset();
            } catch (Exception e) {
                throw new MimeException(e);
            }
        }
        return getMimeTypesByteArray(copy);
    }

    /**
     * Decides whether the passed in bytes are text and, if so, which encoding
     * describes them best. Registered TextMimeHandler(s) are then given the
     * chance to refine the resulting TextMimeType.
     *
     * @param data the content to inspect
     * @return a Collection containing a single TextMimeType
     * @throws UnsupportedOperationException if no supported encodings are configured, the data looks
     *         binary or none of the supported encodings can represent the data
     * @see MimeDetector#getMimeTypesByteArray(byte[])
     */
    public Collection getMimeTypesByteArray(byte[] data)
            throws UnsupportedOperationException {

        // An empty list of supported encodings switches this detector off.
        // Otherwise check if the array contains binary data.
        if (EncodingGuesser.getSupportedEncodings().isEmpty() || isBinary(data)) {
            throw new UnsupportedOperationException();
        }

        Collection possibleEncodings = EncodingGuesser.getPossibleEncodings(data);
        if (log.isDebugEnabled()) {
            log.debug("Possible encodings [" + possibleEncodings.size() + "] " + possibleEncodings);
        }

        if (possibleEncodings.isEmpty()) {
            // Is not a text file understood by this JVM
            throw new UnsupportedOperationException();
        }

        String encoding = selectEncoding(possibleEncodings);

        Collection mimeTypes = new ArrayList();
        mimeTypes.add(new TextMimeType("text/plain", encoding));

        if (handlers.isEmpty()) {
            // Nothing to handle
            return mimeTypes;
        }

        // The content is handed to the handlers decoded with the chosen encoding and without any BOM
        try {
            int lengthBOM = EncodingGuesser.getLengthBOM(encoding, data);
            String content = new String(EncodingGuesser.getByteArraySubArray(data, lengthBOM, data.length - lengthBOM), encoding);
            return fireMimeHandlers(mimeTypes, content);
        } catch (UnsupportedEncodingException e) {
            // This should never, never, never happen. If it does, the handlers are
            // skipped and the plain text/plain result is returned, but leave a trace.
            log.error("Unable to decode content using encoding [" + encoding + "]. TextMimeHandler(s) were not called.", e);
        }
        return mimeTypes;
    }

    /*
     * Picks the encoding to report from a non empty collection of possible encodings:
     * the first preferred encoding that is possible, else the default encoding if it
     * is possible, else simply the first possible encoding.
     */
    private static String selectEncoding(Collection possibleEncodings) {
        // Iterate over the preferred encodings in the order defined and return the first one found
        for (Iterator it = TextMimeDetector.preferredEncodings.iterator(); it.hasNext();) {
            String preferred = (String) it.next();
            if (possibleEncodings.contains(preferred)) {
                return preferred;
            }
        }
        // If none of the preferred encodings were acceptable lets see if the default encoding can be used.
        String defaultEncoding = EncodingGuesser.getDefaultEncoding();
        if (possibleEncodings.contains(defaultEncoding)) {
            return defaultEncoding;
        }
        // Neither a preferred nor the default encoding is possible so use the first possible encoding
        return (String) possibleEncodings.iterator().next();
    }

    /*
     * Closes the stream if there is one. Failures to close are logged and
     * otherwise ignored so they never hide the outcome of the detection.
     */
    private static void closeLoggingFailure(InputStream in) {
        if (in == null) {
            // The stream was never opened so there is nothing to close
            return;
        }
        try {
            in.close();
        } catch (Exception e) {
            log.error(e.getLocalizedMessage(), e);
        }
    }

    /**
     * Change the list of preferred encodings.
     * This list is used where multiple possible encodings are identified to refer to
     * the contents in a byte array passed in or read in from a Stream or File object.
     *
     * This list is iterated over in order and the first entry that is also one of the
     * possible encodings is set as the encoding for the text/plain TextMimeType.
     *
     * If none of these preferred encodings is in the list of possible encodings then the
     * default encoding is used if it is possible, otherwise the first possible encoding
     * will be used.
     *
     * @param encodings String array of canonical encoding names.
     */
    public static void setPreferredEncodings(String [] encodings) {
        TextMimeDetector.preferredEncodings = EncodingGuesser.getValidEncodings(encodings);
        if (log.isDebugEnabled()) {
            log.debug("Preferred Encodings set to " + TextMimeDetector.preferredEncodings);
        }
    }

    /**
     * Register a TextMimeHandler
     * @param handler to register
     */
    public static void registerTextMimeHandler(TextMimeHandler handler) {
        handlers.add(handler);
    }

    /**
     * Unregister a TextMimeHandler
     * @param handler to unregister
     */
    public static void unregisterTextMimeHandler(TextMimeHandler handler) {
        handlers.remove(handler);
    }

    /**
     * Get the current Collection of registered TextMimeHandler(s).
     * This is the live internal collection, not a copy.
     * @return currently registered collection of TextMimeHandler(s)
     */
    public static Collection getRegisteredTextMimeHandlers() {
        return handlers;
    }

    /**
     * Give registered TextMimeHandler(s) the opportunity to influence the
     * actual mime type before returning from the getMimeTypesXXX(...) methods
     * @param mimeTypes Collection holding the single TextMimeType created by this detector
     * @param content the decoded text content
     * @return the passed in Collection; its TextMimeType is the instance handed to the handlers
     */
    private Collection fireMimeHandlers(Collection mimeTypes, String content) {
        // We only have one entry in the mimeTypes Collection due to the way
        // this MimeDetector works.
        TextMimeType mimeType = (TextMimeType) mimeTypes.iterator().next();

        for (Iterator it = handlers.iterator(); it.hasNext(); ) {
            TextMimeHandler tmh = (TextMimeHandler) it.next();
            if (tmh.handle(mimeType, content)) {
                // The first handler to return true will short circuit the rest of the handlers
                break;
            }
        }
        return mimeTypes;
    }

    /*
     * This is a quick check for the byte array to see if it contains binary data.
     *
     * Text is assumed never to contain more than MAX_NULL_VALUES consecutive null values, so the
     * method does a quick and dirty elimination of what are probably binary files. A run of up to
     * MAX_NULL_VALUES null bytes must be tolerated: UTF-16 encodes every ASCII range character
     * with one zero byte, so rejecting a single null byte would eliminate such text files.
     *
     * It is possible that some binary files will not have more than MAX_NULL_VALUES consecutive null
     * values especially if it's a small file and will slip through here. Later tests should eliminate these.
     *
     * We will modify this method to include other known sequences as and when we discover them
     */
    private boolean isBinary(byte [] data) {

        int consecutiveNulls = 0;

        for (int i = 0; i < data.length; i++) {
            if (data[i] == 0) {
                consecutiveNulls++;
                if (consecutiveNulls > MAX_NULL_VALUES) {
                    return true;
                }
            } else {
                consecutiveNulls = 0;
            }
        }
        return false;
    }
}