1 /*
2 * Copyright 2007-2009 Medsea Business Solutions S.L.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 /**
17 * <p>
18 * </p>
19 * @author Steven McArdle
20 */
21 package eu.medsea.mimeutil;
22
23 import java.io.BufferedInputStream;
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.io.UnsupportedEncodingException;
29 import java.net.URL;
30 import java.util.ArrayList;
31 import java.util.Collection;
32 import java.util.Iterator;
33
34 import org.slf4j.Logger;
35 import org.slf4j.LoggerFactory;
36
37 import eu.medsea.mimeutil.detector.MimeDetector;
38 import eu.medsea.mimeutil.handler.TextMimeHandler;
39 import eu.medsea.util.EncodingGuesser;
40
41 /**
42 * This MimeDetector cannot be registered, unregistered or subclassed.
43 * It is a default MimeDetector that is pre-installed into the mime-util utility and
44 * is used as the FIRST MimeDetector.
45 * <p>
46 * You can influence this MimeDetector in several ways.
47 * <ul>
48 * <li>Specify a different list of preferred encodings using the static TextMimeDetector.setPreferredEncodings(...) method.</li>
49 * <li>Change the list of supported encodings using the static EncodingGuesser.setSupportedEncodings(...) method.</li>
50 * <li>Register TextMimeHandler(s) using the static TextMimeDetector.registerTextMimeHandler(...) method (very, VERY powerful).</li>
51 * </ul>
52 * <p>
53 * The TextMimeDetector.setPreferredEncodings(...) method is used to provide a preferred list of encodings. The final encoding for the MimeType
54 * will be the first one in this list that is also contained in the possible encodings returned from the EncodingGuesser class. If none of
55 * these match then the first entry in the possible encodings collection is used.
56 * </p>
57 * <p>
58 * The EncodingGuesser.setSupportedEncodings(...) method is used to set the list of encodings that will be considered when trying to guess the
59 * encoding. If you provide encodings that are not supported by your JVM an error is logged and the next encoding is tried. If you set this to an
60 * empty Collection then you will effectively turn this MimeDetector OFF (the default). This is the recommended way to disable this MimeDetector.
61 * The most common usage scenario for this method is when your application is designed to support only a limited set of encodings such as
62 * UTF-8 and UTF-16 encoded text files. You can set the supported encodings list to this sub set of encodings and improve the performance
63 * of this MimeDetector greatly.
64 * </p>
65 * <p>
66 * The TextMimeDetector.registerTextMimeHandler(...) method can be used to register special TextMimeHandler(s). These MimeHandler(s) are
67 * delegated to when once valid encodings have been found for the content contained in File, InputStream or byte []. The handlers can influence
68 * both the returned MimeType and encoding of any matched content. For instance, the default behavior is to return a MimeType of text/plain and
69 * encoding set according to the rules above. The Handler(s) allow you to further process the content and decide that it is in fact a text/xml
70 * or application/svg-xml or even mytype/mysubtype. You can also change the assigned encoding as it may be wrong for your new MimeType.
71 * For instance, if you decide the MimeType is really an XML file and not just a standard text/plain file and the detector calculated that the
72 * best encoding is UTF-8 but you detect and encoding attribute in the XML content for ISO-8859-1, you can set this as well thus returning
73 * a TextMimeType of application/xml with an encoding or ISO-8859-1 instead of a TextMimeType of text/plain and an encoding of UTF-8.<br/><br/>
74 * IMPORTANT: Your handler(s) will only get to see and act on content that this MimeDetector thinks is text in the first place. So if your
75 * restrictions on supported encodings will no longer detect a file as text then your handler(s) will never be called.
76 * </p>
77 * </p>
78 * <p>
79 * The methods will do their best to eliminate any binary files before trying to detect an encoding.
80 * However, if a binary file contains only a few bytes of data or you are very unlucky it could be
81 * mistakenly recognised as a text file and processed by this MimeDetector.
82 * </p>
83 * <p>
84 * The Collection(s) returned from the methods in this class will contain either 0 or 1 MimeType entry
85 * of type TextMimeType with a mime type of "text/plain" or whatever matching registered TextMimeHandler(s) decide to return.
86 * You can test for matches from this MimeDetector by using the instanceof operator on the Collection of returned MimeType(s) to your code
87 * (remember, the returned Collection to you is the accumulated collection from ALL registered MimeDetectors. You can retrieve the
88 * encoding using the getEncoding() method of TextMimeType after casting the MimeType to a TextMimeType.
89 * </p>
90 * <p>
91 * You should also remember that if this MimeDetector puts a TextMimeType into the eventual Collection of MimeType(s) returned to your code
92 * of say "text/plain" and one or more of the other registered MimeDetector(s) also add an instance of "text/plain" in accordance with their
93 * detection rules, the type will not be changed from TextMimeType to MimeType. Only the specificity value of the MimeType will be increased
94 * thus improving the likelihood that this MimeType will be returned from the MimeUtil.getMostSpecificMimeType(Collection mimeTypes) method.
95 * </p>
96 * @author Steven McArdle
97 *
98 */
99 public final class TextMimeDetector extends MimeDetector {
100
101 private static Logger log = LoggerFactory.getLogger(TextMimeDetector.class);
102
103 // The maximum amount of data to retrieve from a stream
104 private static final int BUFFER_SIZE = 1024;
105
106 // No text file should have 2 or more consecutive NULL values
107 private static final int MAX_NULL_VALUES = 1;
108
109 private static Collection preferredEncodings = new ArrayList();
110 static {
111 TextMimeDetector.setPreferredEncodings(new String [] {"UTF-16", "UTF-8", "ISO-8859-1", "windows-1252", "US-ASCII"} );
112 }
113
114 // Registered list of TextMimeHandler(s)
115 private static Collection handlers = new ArrayList();
116
117 // Private so nobody can register one using the MimeUtil.registerMimeDetector(...) method
118 private TextMimeDetector() {
119 }
120
121 // Package scoped so that the class can still be create for use by mime-util without resorting to a singleton approach
122 // Could change this in the future !!!
123 TextMimeDetector(int dummy) {
124 this();
125 }
126
127 /**
128 * @see MimeDetector.getDescription()
129 */
130 public String getDescription() {
131 return "Determine if a file or stream contains a text mime type. If so then return TextMimeType with text/plain and the best guess encoding.";
132 }
133
134 /**
135 * This MimeDetector requires content so defer to the file method
136 */
137 public Collection getMimeTypesFileName(String fileName)
138 throws UnsupportedOperationException {
139 return getMimeTypesFile(new File(fileName));
140 }
141
142 /**
143 * We only want to deal with the stream from the URL
144 * @see MimeDetector.getMimeTypesURL(URL url)
145 */
146 public Collection getMimeTypesURL(URL url)
147 throws UnsupportedOperationException {
148
149 InputStream in = null;
150 try {
151 return getMimeTypesInputStream(in = new BufferedInputStream(MimeUtil.getInputStreamForURL(url)));
152 }catch(UnsupportedOperationException e) {
153 throw e;
154 }catch(Exception e) {
155 throw new MimeException(e);
156 }finally {
157 try {
158 in.close();
159 }catch(Exception ignore) {
160 log.error(ignore.getLocalizedMessage());
161 }
162 }
163 }
164
165 /**
166 * We only want to deal with the stream for the file
167 * @see MimeDetector.getMimeTypesURL(URL url)
168 */
169 public Collection getMimeTypesFile(File file)
170 throws UnsupportedOperationException {
171
172 if(!file.exists()) {
173 throw new UnsupportedOperationException("This MimeDetector requires actual content.");
174 }
175 InputStream in = null;
176 try {
177 in = new BufferedInputStream(new FileInputStream(file));
178 return getMimeTypesInputStream(in);
179 }catch(UnsupportedOperationException e) {
180 throw e;
181 }catch(Exception e) {
182 throw new MimeException(e);
183 }finally {
184 try {
185 in.close();
186 }catch(Exception ignore) {
187 log.error(ignore.getLocalizedMessage());
188 }
189 }
190 }
191
192 /**
193 * @see MimeDetector.getMimeTypesInputStream(InputStream in)
194 */
195 public Collection getMimeTypesInputStream(InputStream in)
196 throws UnsupportedOperationException {
197
198 int offset = 0;
199 int len = TextMimeDetector.BUFFER_SIZE;
200 byte [] data = new byte [len];
201 byte [] copy = null;
202 // Mark the input stream
203 in.mark(len);
204
205 try {
206 // Since an InputStream might return only some data (not all
207 // requested), we have to read in a loop until
208 // either EOF is reached or the desired number of bytes have been
209 // read.
210 int restBytesToRead = len;
211 while (restBytesToRead > 0) {
212 int bytesRead = in.read(data, offset, restBytesToRead);
213 if (bytesRead < 0)
214 break; // EOF
215
216 offset += bytesRead;
217 restBytesToRead -= bytesRead;
218 }
219 if(offset < len) {
220 copy = new byte[offset];
221 System.arraycopy( data, 0, copy, 0, offset );
222 }else {
223 copy = data;
224 }
225 }
226 catch(IOException ioe) {
227 throw new MimeException(ioe);
228 } finally {
229 try {
230 // Reset the input stream to where it was marked.
231 in.reset();
232 }catch(Exception e) {
233 throw new MimeException(e);
234 }
235 }
236 return getMimeTypesByteArray(copy);
237 }
238
239 /**
240 * @see MimeDetector.getMimeTypesByteArray(byte [] data)
241 */
242 public Collection getMimeTypesByteArray(byte[] data)
243 throws UnsupportedOperationException {
244
245 // Check if the array contains binary data
246 if(EncodingGuesser.getSupportedEncodings().isEmpty() || isBinary(data)) {
247 throw new UnsupportedOperationException();
248 }
249
250 Collection mimeTypes = new ArrayList();
251
252 Collection possibleEncodings = EncodingGuesser.getPossibleEncodings(data);
253 if(log.isDebugEnabled()) {
254 log.debug("Possible encodings [" + possibleEncodings.size() + "] " + possibleEncodings);
255 }
256
257 if(possibleEncodings.isEmpty()) {
258 // Is not a text file understood by this JVM
259 throw new UnsupportedOperationException();
260 }
261
262 String encoding = null;
263 // Iterate over the preferedEncodings array in the order defined and return the first one found
264 for(Iterator it = TextMimeDetector.preferredEncodings.iterator(); it.hasNext();) {
265 encoding = (String)it.next();
266 if(possibleEncodings.contains(encoding)) {
267 mimeTypes.add(new TextMimeType("text/plain", encoding));
268 break;
269 }
270 }
271 // If none of the preferred encodings were acceptable lets see if the default encoding can be used.
272 if(mimeTypes.isEmpty() && possibleEncodings.contains(EncodingGuesser.getDefaultEncoding())) {
273 encoding = EncodingGuesser.getDefaultEncoding();
274 mimeTypes.add(new TextMimeType("text/plain", encoding));
275 }
276
277 // If none of our preferredEncodings or the default encoding are in the possible encodings list we return the first possibleEncoding;
278 if(mimeTypes.isEmpty()) {
279 Iterator it = possibleEncodings.iterator();
280 encoding = (String)it.next();
281 mimeTypes.add(new TextMimeType("text/plain", encoding));
282 }
283
284 if(mimeTypes.isEmpty() || handlers.isEmpty()) {
285 // Nothing to handle
286 return mimeTypes;
287 }
288
289 // String will be passed in as is currently in the encoding defined by encoding
290 try {
291 int lengthBOM = EncodingGuesser.getLengthBOM(encoding, data);
292 String content = new String(EncodingGuesser.getByteArraySubArray(data, lengthBOM, data.length - lengthBOM), encoding);
293 return fireMimeHandlers(mimeTypes, content);
294 }catch(UnsupportedEncodingException ignore) {
295 // This should never, never, never happen
296 }
297 return mimeTypes;
298 }
299
300 /**
301 * Change the list of preferred encodings.
302 * This list is used where multiple possible encodings are identified to refer to
303 * the contents in a byte array passed in or read in from a Stream or File object.
304 *
305 * This list is iterated over in order and the first match is set as the encoding for
306 * the text/plain TextMimeType ONLY if the JVM default encoding is not in the list.
307 *
308 * If the neither the defaultEncoding or any of these preferred encodings are in
309 * the list of possible encodings then the first possible encoding will be used.
310 *
311 * @param encodings String array of canonical encoding names.
312 */
313 public static void setPreferredEncodings(String [] encodings) {
314 TextMimeDetector.preferredEncodings = EncodingGuesser.getValidEncodings(encodings);
315 if(log.isDebugEnabled()) {
316 log.debug("Preferred Encodings set to " + TextMimeDetector.preferredEncodings);
317 }
318 }
319
320 /**
321 * Register a TexMimeHandler(s)
322 * @param handler to register
323 */
324 public static void registerTextMimeHandler(TextMimeHandler handler) {
325 handlers.add(handler);
326 }
327
328 /**
329 * Unregister a TextMimeHandler
330 * @param handler to unregister
331 */
332 public static void unregisterTextMimeHandler(TextMimeHandler handler) {
333 handlers.remove(handler);
334 }
335
336 /**
337 * Get the current Collection of registered TexMimeHandler(s)
338 * @return currently registered collection of TextMimeHandler(s)
339 */
340 public static Collection getRegisteredTextMimeHandlers() {
341 return handlers;
342 }
343
344 /**
345 * Give registered TextMimeHandler(s) the opportunity to influence the
346 * actual mime type before returning from the getMimeTypesXXX(...) methods
347 * @param mimeTypes
348 * @param content
349 * @return
350 */
351 private Collection fireMimeHandlers(Collection mimeTypes, String content) {
352 // We only have one entry in the mimeTypes Collection due to the way
353 // this MimeDetector works.
354 TextMimeType mimeType = (TextMimeType)mimeTypes.iterator().next();
355
356 for(Iterator it = handlers.iterator(); it.hasNext(); ) {
357 TextMimeHandler tmh = (TextMimeHandler)it.next();
358 if(tmh.handle(mimeType, content)) {
359 // The first handler to return true will short circuit the rest of the handlers
360 break;
361 }
362 }
363 return mimeTypes;
364 }
365
366 /*
367 * This is a quick check for the byte array to see if it contains binary data.
368 *
369 * As no known text encoding can have more than MAX_NULL_VALUES consecutive null values the
370 * method does a quick and dirty elimination of what are probably binary files but should never eliminate possible text files.
371 *
372 * It is possible that some binary files will not have MAX_NULL_VALUES consecutive byte
373 * values especially if it's a small file and will slip through here. Later tests should eliminate these.
374 *
375 * We will modify this method to include other known sequences as and when we discover them
376 */
377 private boolean isBinary(byte [] data) {
378
379 int negCount = 0;
380
381 for(int i = 0; i < data.length; i++) {
382 if(data[i] == 0) {
383 negCount++;
384 } else {
385 negCount = 0;
386 }
387 if(negCount == MAX_NULL_VALUES) {
388 return true;
389 }
390 }
391 return false;
392 }
393 }
394
395