summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/installdir/HtFileType
blob: e47d70e1aac787ebd0b5004ca18bc7182aa8f1b2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/bin/sh

# HtFileType
#
#     Determine a file's MIME type from its contents.
#
#     Usage: HtFileType file...
#     Prints one MIME type per line, in the order the files were given.
#

# Part of the ht://Dig package   <http://www.htdig.org/>
# Copyright (c) 2003 The ht://Dig Group
# For copyright details, see the file COPYING in your distribution
# or the GNU Public License version 2 or later
# <http://www.gnu.org/copyleft/gpl.html>

# $Id: HtFileType,v 1.5 2004/06/11 15:55:16 grdetil Exp $

# Probe the installed file(1): one that accepts -v is treated as "modern",
# i.e. able to take the -i and -b options used below.  Anything else gets
# the slower fallback that pre-processes the input by hand.  The probe's
# own output (including any usage error) is discarded.
if file -v > /dev/null 2>&1; then
    have_modern_file=true
else
    have_modern_file=false
fi

# Magic database handed to file(1) with -m.
# NOTE(review): @CONFIG_DIR@ looks like a placeholder filled in when the
# package is built/installed -- confirm against the build scripts.
magic_file=@CONFIG_DIR@/HtFileType-magic.mime
#magic_file=${0}-magic.mime

# Scratch file used only by the fallback (old file(1)) path; stays empty
# when the modern path is in use.
tmpfile=

# Remove the scratch file, if one was created.
cleanup() {
  if [ -n "$tmpfile" ]; then
    rm -f "$tmpfile"
  fi
}

# Clean up on every exit path.  Not every sh runs the EXIT trap when it is
# killed by a signal, so turn the common signals into a normal exit.
trap cleanup EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# Create the scratch file once, and only when it is needed.  It is kept
# (and overwritten) for the whole run rather than being deleted and then
# re-created by a plain redirection under a now-known name.
case $have_modern_file in
  false) tmpfile=$(mktemp "${TMPDIR:-/tmp}/HtFileType.XXXXXX") || exit 1 ;;
esac

# describe_file PATH
#   Print file(1)'s verdict on PATH, classified from the start of the file
#   using $magic_file.  Anything from the first ',' or ';' onwards (e.g.
#   ', with very long lines' or '; charset=...') is cut off.  Prints
#   nothing useful if file(1) fails; its stderr is deliberately discarded.
describe_file() {
  if $have_modern_file; then
    file -i -b -m "$magic_file" "$1" 2>/dev/null | sed 's/[,;].*//'
  else
    # Old file(1) can't strip leading whitespace or accept -i, so feed it
    # a cleaned-up copy of the first 100 lines instead: newlines and
    # carriage returns become spaces, leading blanks are dropped, and a
    # broken "<! " comment opener is turned into "<!--".
    head -n 100 "$1" 2>/dev/null | tr '\012\015' ' ' |
      sed -e 's/^ *//' -e 's/ *<! /<!--/' > "$tmpfile"
    # Without -b the output is "name: description"; drop the name part.
    file -m "$magic_file" "$tmpfile" 2>/dev/null |
      sed -e 's/.*:[ 	]*//' -e 's/[,;].*//'
  fi
}

# mime_from_description DESCRIPTION
#   Map the output of describe_file to a MIME type and print it.
mime_from_description() {
  case $1 in
    # 'file' calls most human-readable files "text", so check what type
    *text)
      case $1 in
        *HTML*) printf '%s\n' text/html ;;
        *SGML*) printf '%s\n' text/sgml ;;
        *XML*)  printf '%s\n' text/xml ;;

        # ignore most source code
        *script* | *program* | *command*)
          printf '%s\n' application/x-unknown ;;
        *pre-processor* | *"'diff' output"*)
          printf '%s\n' application/x-unknown ;;

        # assume all other text is plain
        # (Includes outputs "English t.", "ASCII t.", "international t.")
        *) printf '%s\n' text/plain ;;
      esac ;;

    # Our magic file already outputs MIME types, so don't change them
    */*) printf '%s\n' "$1" ;;

    # Other recognised types
    *postscript* | *PostScript*) printf '%s\n' application/postscript ;;
    *PDF* | *acrobat*)           printf '%s\n' application/pdf ;;

    # otherwise give up
    *) printf '%s\n' application/x-unknown ;;
  esac
}

# main FILE...
#   Print one MIME type per FILE, in argument order.
main() {
  # "for input" with no "in" list walks the positional parameters one by
  # one, so names containing spaces stay intact.
  for input
  do
    # A name such as "-x.xml" would be parsed as an option by file(1) and
    # head(1); "./" in front makes it unambiguous.  A lone "-" is left as is.
    case $input in
      -?*) input=./$input ;;
    esac

    output=$(describe_file "$input")
    mime_type=$(mime_from_description "$output")

    # Catch HTML documents, which are special cases of SGML and XML
    case $mime_type in
      text/xml | text/sgml)
        if head -n 100 "$input" |
          grep -E -i '<html|<head|<title|< *a *href *=' > /dev/null
        then
          mime_type=text/html
        fi ;;
    esac

    # printf with a quoted argument: the type is printed verbatim, with no
    # globbing or word splitting of whatever file(1) reported.
    printf '%s\n' "$mime_type"
  done
}

main "$@"