View Javadoc

1   /*
2    * Copyright 2007-2009 Medsea Business Solutions S.L.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package eu.medsea.mimeutil.detector;
17  
18  import java.io.BufferedInputStream;
19  import java.io.BufferedReader;
20  import java.io.File;
21  import java.io.FileInputStream;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.InputStreamReader;
26  import java.io.Reader;
27  import java.net.URL;
28  import java.util.ArrayList;
29  import java.util.Arrays;
30  import java.util.Collection;
31  import java.util.Collections;
32  import java.util.Enumeration;
33  import java.util.Iterator;
34  import java.util.LinkedHashSet;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.regex.Pattern;
38  
39  import org.slf4j.Logger;
40  import org.slf4j.LoggerFactory;
41  
42  import eu.medsea.mimeutil.MimeException;
43  import eu.medsea.mimeutil.MimeUtil;
44  
45  /**
46   * The magic mime rules files are loaded in the following way.
47   * <ol>
48   * <li>From a JVM system property <code>magic-mime</code> i.e
49   * <code>-Dmagic-mime=../my/magic/mime/rules</code></li>
50   * <li>From any file named <code>magic.mime</code> that can be found on the
51   * classpath</li>
52   * <li>From a file named <code>.magic.mime</code> in the users home directory</li>
53   * <li>From the normal Unix locations <code>/usr/share/mimelnk/magic</code>,
54   * <code>/usr/share/file/magic.mime</code> and <code>/etc/magic.mime</code> (in that order)</li>
55   * <li>From the internal <code>magic.mime</code> file
56   * <code>eu.medsea.mimeutil.magic.mime</code> if, and only if, no files are
57   * located in step 4 above.</li>
58   * </ol>
59   * Each rule file is appended to the end of the existing rules, so a rule
60   * defined earlier in the sequence takes precedence over rules loaded
61   * later.
62   * <p>
63   * You can add new mime mapping rules using the syntax defined for the Unix
64   * magic.mime file by placing these rules in any of the files or locations
65   * listed above. You can also change an existing mapping rule by redefining the
66   * existing rule in one of the files listed above. This is handy for some of the
67   * more sketchy rules defined in the existing Unix magic.mime files.
68   * <p>
69   * We extended the string type rule which allows you to match strings in a file
70   * where you do not know the actual offset of the string containing magic file
71   * information it goes something like “what I am looking for will be ‘somewhere’
72   * within the next n characters” from this location. This is an important
73   * improvement to the string matching rules especially for text based documents
74   * such as HTML and XML formats. The reasoning for this was that the rules for
75   * matching SVG images defined in the original 'magic.mime' file hardly ever
76   * worked, this is because of the fixed offset definitions within the magic rule
77   * format. As XML documents generally have an XML declaration that can contain
78   * various optional attributes the length of this header often cannot be
79   * determined, therefore we cannot know that the DOCTYPE declaration for an SVG
80   * xml file starts at “this” location, all we can say is that, if this is an SVG
81   * xml file then it will have an SVG DOCTYPE somewhere near the beginning of the
82   * file and probably within the first 1024 characters. So we test for the xml
83   * declaration and then we test for the DOCTYPE within a specified number of
84   * characters and if found then we match this rule. This extension can be used
85   * to better identify ALL of the XML type mime mappings in the current
86   * 'magic.mime' file. Remember though, as we stated earlier mime type matching
87   * using any of the mechanisms supported is not an exact science and should
88   * always be viewed as a 'best guess' and not as a 'definite match'.
89   * </p>
90   * <p>
91   * An example of overriding the PNG and SVG rules can be found in our internal
92   * 'magic.mime' file located in the test_files directory (this file is NOT used
93   * when locating rules and is used for testing purposes only). This PNG rule
94   * overrides the original PNG rule defined in the 'magic.mime' file we took from
95   * the Internet, and the SVG rule overrides the SVG detection also defined in
96   * the original 'magic.mime' file
97   * </p>
98   * <p>
99   *
100  * <pre>
101  * #PNG Image Format
102  * 0		string		\211PNG\r\n\032\n		image/png
103  *
104  * #SVG Image Format
105  * #	We know its an XML file so it should start with an XML declaration.
106  * 0	string	\&lt;?xml\ version=	text/xml
107  * #	As the XML declaration in an XML file can be short or extended we cannot know
108  * #	exactly where the declaration ends i.e. how long it is,
109  * #	also it could be terminated by a new line(s) or a space(s).
110  * #	So the next line states that somewhere after the 15th character position we should find the DOCTYPE declaration.
111  * #	This DOCTYPE declaration should be within 1024 characters from the 15th character
112  * &gt;15	string&gt;1024&lt;	\&lt;!DOCTYPE\ svg\ PUBLIC\ &quot;-//W3C//DTD\ SVG 	image/svg+xml
113  * </pre>
114  *
115  * </p>
116  * <p>
117  * As you can see the extension is defined using the syntax
118  * <code>string&gt;bufsize&lt;</code>. It can only be used on a string type and basically
119  * means: match this within bufsize characters of the position defined at the beginning of the line.
120  * This rule is much more verbose than required as we really only need to check
121  * for the presence of SVG. As we said earlier, this is a test case file and not
122  * used by the utility under normal circumstances.
123  *
124  * The test mime-types.properties and magic.mime files we use can be located in
125  * the test_files directory of this distribution.
126  * </p>
127  * <p>
128  * We use the <code>application/directory</code> mime type to identify
129  * directories. Even though this is not an official mime type it seems to be
130  * well accepted on the net as an unofficial mime type so we thought it was OK
131  * for us to use as well.
132  * </p>
133  * <p>
134  * This class is auto loaded by MimeUtil as it has an entry in the file called
135  * MimeDetectors. MimeUtil reads this file at startup and calls Class.forName()
136  * on each entry found. This means the MimeDetector must have a no-arg
137  * constructor.
138  * </p>
139  *
140  * @author Steven McArdle.
141  *
142  */
143 public class MagicMimeMimeDetector extends MimeDetector {
144 
145 	private static Logger log = LoggerFactory.getLogger(MagicMimeMimeDetector.class);
146 
147 	// Having the defaultLocations as protected allows you to subclass this class
148 	// and add different paths or remove them all so that the internal file is always used
149 	protected static String[] defaultLocations = { "/usr/share/mimelnk/magic",
150 			"/usr/share/file/magic.mime", "/etc/magic.mime" };
151 	private static List magicMimeFileLocations = Arrays
152 			.asList(defaultLocations);
153 
154 	private static ArrayList mMagicMimeEntries = new ArrayList();
155 
156 	public MagicMimeMimeDetector() {
157 		MagicMimeMimeDetector.initMagicRules();
158 	}
159 
160 	public String getDescription() {
161 		return "Get the mime types of files or streams using the Unix file(5) magic.mime files";
162 	}
163 
164 	/**
165 	 * Get the mime types that may be contained in the data array.
166 	 *
167 	 * @param data. The byte array that contains data we want to detect mime types from.
168 	 * @return the mime types.
169 	 * @throws MimeException if for instance we try to match beyond the end of the data.
170 	 */
171 	public Collection getMimeTypesByteArray(final byte[] data)
172 			throws UnsupportedOperationException {
173 		Collection mimeTypes = new LinkedHashSet();
174 		int len = mMagicMimeEntries.size();
175 		try {
176 			for (int i = 0; i < len; i++) {
177 				MagicMimeEntry me = (MagicMimeEntry) mMagicMimeEntries.get(i);
178 				MagicMimeEntry matchingMagicMimeEntry = me.getMatch(data);
179 				if (matchingMagicMimeEntry != null) {
180 					mimeTypes.add(matchingMagicMimeEntry.getMimeType());
181 				}
182 			}
183 		} catch (Exception e) {
184 			log.error(e.getMessage(), e);
185 		}
186 		return mimeTypes;
187 	}
188 
189 
190 	/**
191 	 * Get the mime types of the data in the specified {@link InputStream}.
192 	 * Therefore, the <code>InputStream</code> must support mark and reset (see
193 	 * {@link InputStream#markSupported()}). If it does not support mark and
194 	 * reset, an {@link MimeException} is thrown.
195 	 *
196 	 * @param in
197 	 *            the stream from which to read the data.
198 	 * @return the mime types.
199 	 * @throws MimeException
200 	 *             if the specified <code>InputStream</code> does not support
201 	 *             mark and reset (see {@link InputStream#markSupported()}).
202 	 */
203 	public Collection getMimeTypesInputStream(final InputStream in)
204 			throws UnsupportedOperationException {
205 		Collection mimeTypes = new LinkedHashSet();
206 		int len = mMagicMimeEntries.size();
207 		try {
208 			for (int i = 0; i < len; i++) {
209 				MagicMimeEntry me = (MagicMimeEntry) mMagicMimeEntries.get(i);
210 				MagicMimeEntry matchingMagicMimeEntry = me.getMatch(in);
211 				if (matchingMagicMimeEntry != null) {
212 					mimeTypes.add(matchingMagicMimeEntry.getMimeType());
213 				}
214 			}
215 		} catch (Exception e) {
216 			log.error(e.getMessage(), e);
217 		}
218 		return mimeTypes;
219 	}
220 
221 	/**
222 	 * Defer this call to the File method
223 	 */
224 	public Collection getMimeTypesFileName(final String fileName) throws UnsupportedOperationException {
225 		return getMimeTypesFile(new File(fileName));
226 	}
227 
228 
229 	/**
230 	 * Defer this call to the InputStream method
231 	 */
232 	public Collection getMimeTypesURL(final URL url) throws UnsupportedOperationException {
233 		InputStream in = null;
234 		try {
235 			return getMimeTypesInputStream(in = new BufferedInputStream(MimeUtil.getInputStreamForURL(url)));
236 		}catch(Exception e) {
237 			throw new MimeException(e);
238 		}finally {
239 			closeStream(in);
240 		}
241 	}
242 
243 	/**
244 	 * Defer this call to the InputStream method
245 	 */
246 	public Collection getMimeTypesFile(final File file) throws UnsupportedOperationException {
247 		InputStream in = null;
248 		try {
249 			return getMimeTypesInputStream(in = new BufferedInputStream(new FileInputStream(file)));
250 		}catch(FileNotFoundException e) {
251 			throw new UnsupportedOperationException(e.getLocalizedMessage());
252 		}catch(Exception e) {
253 			throw new MimeException(e);
254 		}finally {
255 			closeStream(in);
256 		}
257 	}
258 
259 	/*
260 	 * This loads the magic.mime file rules into the internal parse tree in the
261 	 * following order 1. From any magic.mime that can be located on the
262 	 * classpath 2. From any magic.mime file that can be located using the
263 	 * environment variable MAGIC 3. From any magic.mime located in the users
264 	 * home directory ~/.magic.mime file if the MAGIC environment variable is
265 	 * not set 4. From the locations defined in the magicMimeFileLocations and
266 	 * the order defined 5. From the internally defined magic.mime file ONLY if
267 	 * we are unable to locate any of the files in steps 2 - 5 above Thanks go
268 	 * to Simon Pepping for his bug report
269 	 */
270 	private static void initMagicRules() {
271 		InputStream in = null;
272 
273 		// Try to locate a magic.mime file locate by system property magic-mime
274 		try {
275 			String fname = System.getProperty("magic-mime");
276 			if (fname != null && fname.length() != 0) {
277 				in = new FileInputStream(fname);
278 				if (in != null) {
279 					parse("-Dmagic-mime=" + fname, new InputStreamReader(in));
280 				}
281 			}
282 		} catch (Exception e) {
283 			log.error("Failed to parse custom magic mime file defined by system property -Dmagic-mime ["
284 					+ System.getProperty("magic-mime")
285 					+ "]. File will be ignored.", e);
286 		} finally {
287 			in = closeStream(in);
288 		}
289 
290 		// Try to locate a magic.mime file(s) on the classpath
291 
292 		// Get an enumeration of all files on the classpath with this name. They could be in jar files as well
293 		try {
294 			Enumeration en = MimeUtil.class.getClassLoader().getResources("magic.mime");
295 			while(en.hasMoreElements()) {
296 				URL url = (URL)en.nextElement();
297 				in = url.openStream();
298 				if(in != null) {
299 					try {
300 						parse("classpath:[" + url + "]", new InputStreamReader(in));
301 					} catch(Exception ex) {
302 						log.error("Failed to parse magic.mime rule file [" + url + "] on the classpath. File will be ignored.",
303 							ex);
304 					}
305 				}
306 
307 			}
308 		}catch(Exception e) {
309 			log.error("Problem while processing magic.mime files from classpath. Files will be ignored.", e);
310 		} finally {
311 			in = closeStream(in);
312 		}
313 
314 		// Now lets see if we have one in the users home directory. This is
315 		// named .magic.mime as opposed to magic.mime
316 		try {
317 			File f = new File(System.getProperty("user.home") + File.separator
318 					+ ".magic.mime");
319 			if (f.exists()) {
320 				in = new FileInputStream(f);
321 				if (in != null) {
322 					try {
323 						parse(f.getAbsolutePath(), new InputStreamReader(in));
324 					} catch(Exception ex) {
325 						log.error("Failed to parse .magic.mime file from the users home directory. File will be ignored.", ex);
326 					}
327 				}
328 			}
329 		}catch(Exception e) {
330 			log.error("Problem while processing .magic.mime file from the users home directory. File will be ignored.", e);
331 		} finally {
332 			in = closeStream(in);
333 		}
334 
335 		// Now lets see if we have an environment variable named MAGIC set. This
336 		// would normally point to a magic or magic.mgc file.
337 		// As we don't use these file types we will look to see if there is also
338 		// a magic.mime file at this location for us to use.
339 		try {
340 			String name = System.getProperty("MAGIC");
341 			if (name != null && name.length() != 0) {
342 				// Strip the .mgc from the end if it's there and add the .mime
343 				// extension
344 				if (name.indexOf('.') < 0) {
345 					name = name + ".mime";
346 				} else {
347 					// remove the mgc extension
348 					name = name.substring(0, name.indexOf('.') - 1) + "mime";
349 				}
350 				File f = new File(name);
351 				if (f.exists()) {
352 					in = new FileInputStream(f);
353 					if (in != null) {
354 						try {
355 							parse(f.getAbsolutePath(),
356 									new InputStreamReader(in));
357 						}catch(Exception ex) {
358 							log.error("Failed to parse magic.mime file from directory located by environment variable MAGIC. File will be ignored.", ex);
359 						}
360 					}
361 				}
362 			}
363 		} catch (Exception e) {
364 			log.error("Problem while processing magic.mime file from directory located by environment variable MAGIC. File will be ignored.", e);
365 		} finally {
366 			in = closeStream(in);
367 		}
368 
369 		// Parse the UNIX magic(5) magic.mime files. Since there can be
370 		// multiple, we have to load all of them.
371 		// We save, how many entries we have now, in order to fall back to our
372 		// default magic.mime that we ship,
373 		// if no entries were read from the OS.
374 
375 		int mMagicMimeEntriesSizeBeforeReadingOS = mMagicMimeEntries.size();
376 		Iterator it = magicMimeFileLocations.iterator();
377 		while (it.hasNext()) {
378 			parseMagicMimeFileLocation((String) it.next());
379 		}
380 
381 		if (mMagicMimeEntriesSizeBeforeReadingOS == mMagicMimeEntries.size()) {
382 			// Use the magic.mime that we ship
383 			try {
384 				String resource = "eu/medsea/mimeutil/magic.mime";
385 				in = MimeUtil.class.getClassLoader().getResourceAsStream(
386 						resource);
387 				if(in != null) {
388 					try {
389 						parse("resource:" + resource, new InputStreamReader(in));
390 					}catch(Exception ex) {
391 						log.error("Failed to parse internal magic.mime file.", ex);
392 					}
393 				}
394 			} catch (Exception e) {
395 				log.error("Problem while processing internal magic.mime file.", e);
396 			} finally {
397 				in = closeStream(in);
398 			}
399 		}
400 	}
401 
402 	private static void parseMagicMimeFileLocation(final String location) {
403 		InputStream is = null;
404 
405 		List magicMimeFiles = getMagicFilesFromMagicMimeFileLocation(location);
406 
407 		for (Iterator itFile = magicMimeFiles.iterator(); itFile.hasNext();) {
408 			File f = (File) itFile.next();
409 			try {
410 				if (f.exists()) {
411 					is = new FileInputStream(f);
412 					try {
413 						parse(f.getAbsolutePath(), new InputStreamReader(is));
414 					}catch(Exception e) {
415 						log.error("Failed to parse " + f.getName() + ". File will be ignored.");
416 					}
417 				}
418 			} catch (Exception e) {
419 				log.error(e.getMessage(), e);
420 			} finally {
421 				is = closeStream(is);
422 			}
423 		}
424 	}
425 
426 	private static List getMagicFilesFromMagicMimeFileLocation(
427 			final String magicMimeFileLocation) {
428 		List magicMimeFiles = new LinkedList();
429 		if (magicMimeFileLocation.indexOf('*') < 0) {
430 			magicMimeFiles.add(new File(magicMimeFileLocation));
431 		} else {
432 			int lastSlashPos = magicMimeFileLocation.lastIndexOf('/');
433 			File dir;
434 			String fileNameSimplePattern;
435 			if (lastSlashPos < 0) {
436 				dir = new File("someProbablyNotExistingFile").getAbsoluteFile()
437 						.getParentFile();
438 				fileNameSimplePattern = magicMimeFileLocation;
439 			} else {
440 				String dirName = magicMimeFileLocation.substring(0,
441 						lastSlashPos);
442 				if (dirName.indexOf('*') >= 0)
443 					throw new UnsupportedOperationException(
444 							"The wildcard '*' is not allowed in directory part of the location! Do you want to implement expressions like /path/**/*.mime for recursive search? Please do!");
445 
446 				dir = new File(dirName);
447 				fileNameSimplePattern = magicMimeFileLocation
448 						.substring(lastSlashPos + 1);
449 			}
450 
451 			if (!dir.isDirectory())
452 				return Collections.EMPTY_LIST;
453 
454 			String s = fileNameSimplePattern.replaceAll("\\.", "\\\\.");
455 			s = s.replaceAll("\\*", ".*");
456 			Pattern fileNamePattern = Pattern.compile(s);
457 
458 			File[] files = dir.listFiles();
459 			for (int i = 0; i < files.length; i++) {
460 				File file = files[i];
461 
462 				if (fileNamePattern.matcher(file.getName()).matches())
463 					magicMimeFiles.add(file);
464 			}
465 		}
466 		return magicMimeFiles;
467 	}
468 
469 	// Parse the magic.mime file
470 	private static void parse(final String magicFile, final Reader r)
471 			throws IOException {
472 		long start = System.currentTimeMillis();
473 
474 		BufferedReader br = new BufferedReader(r);
475 		String line;
476 		ArrayList sequence = new ArrayList();
477 
478 		long lineNumber = 0;
479 		line = br.readLine();
480 		if (line != null)
481 			++lineNumber;
482 		while (true) {
483 			if (line == null) {
484 				break;
485 			}
486 			line = line.trim();
487 			if (line.length() == 0 || line.charAt(0) == '#') {
488 				line = br.readLine();
489 				if (line != null)
490 					++lineNumber;
491 				continue;
492 			}
493 			sequence.add(line);
494 
495 			// read the following lines until a line does not begin with '>' or
496 			// EOF
497 			while (true) {
498 				line = br.readLine();
499 				if (line != null)
500 					++lineNumber;
501 				if (line == null) {
502 					addEntry(magicFile, lineNumber, sequence);
503 					sequence.clear();
504 					break;
505 				}
506 				line = line.trim();
507 				if (line.length() == 0 || line.charAt(0) == '#') {
508 					continue;
509 				}
510 				if (line.charAt(0) != '>') {
511 					addEntry(magicFile, lineNumber, sequence);
512 					sequence.clear();
513 					break;
514 				}
515 				sequence.add(line);
516 			}
517 
518 		}
519 		if (!sequence.isEmpty()) {
520 			addEntry(magicFile, lineNumber, sequence);
521 		}
522 
523 		if (log.isDebugEnabled())
524 			log.debug("Parsing \"" + magicFile + "\" took "
525 					+ (System.currentTimeMillis() - start) + " msec.");
526 	}
527 
528 	private static void addEntry(final String magicFile, final long lineNumber,
529 			final ArrayList aStringArray) {
530 		try {
531 			MagicMimeEntry magicEntry = new MagicMimeEntry(aStringArray);
532 			mMagicMimeEntries.add(magicEntry);
533 			// Add this to the list of known mime types as well
534 			if (magicEntry.getMimeType() != null) {
535 				MimeUtil.addKnownMimeType(magicEntry.getMimeType());
536 			}
537 		} catch (InvalidMagicMimeEntryException e) {
538 			// Continue on but lets print an exception so people can see there
539 			// is a problem
540 			log.warn(e.getClass().getName() + ": " + e.getMessage()
541 					+ ": file \"" + magicFile + "\": before or at line "
542 					+ lineNumber, e);
543 		}
544 	}
545 }