title.bash


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

#!/bin/bash

# v0.1  : basic title extract
# v0.2  : added help arguments
# v0.3  : added version arguments
# v0.3.1: change !title to title in version info
# v0.4  : added case insensitive matching
# v0.5  : ignore SSL certificate errors
# v0.6  : switch to curl, add maximum filesize and time limits
# v0.7  : follow HTTP redirects, was broken since switch to curl
# v0.7.1: pretend to be Firefox, some broken websites reject cURL
# v0.8  : strip newlines and leading whitespace from "broken" titles
#       : update user agent string
# v0.8.1: change name in version string to "blatitle"
# v0.9  : add support for gzip compression
# v0.9.1: fix support for servers that don't support HTTP HEAD
# v0.9.2: tidy up temporary files afterwards
# v0.9.3: advertise text/html acceptance as some servers require it
# v0.9.4: switch to a cleaner awk script to detect titles
# v1.0  : if Content-Type is text/plain, just use the first line as the title
# v1.1  : decode HTML in the title using recode (adds recode as a dependency)
# v1.2  : deal with some cURL exit codes
# v1.2.1: decode HTML using PHP instead of recode (adds PHP as a dependency, removes recode)
# v1.2.2: do case insensitive header checks
# v1.2.3: find the first non-blank text/plain line rather than selecting the literal first line
# v1.3  : add support for Gopher (take first line as title, like text/plain)
# v1.3.1: add a specific interface for cURL to use
# v1.4  : set Accept-Language to en-GB
# v1.4.1: claim a more recent Firefox in the user agent string to make Twitter send the real page title
# v1.5  : add a quirks mode, starting with claiming to be cURL for YouTube
# v1.5.1: disable Bash globbing
# v1.6  : handle multiple title tags, only spoof IP if in quirks mode, add www.ispreview.co.uk to quirks mode
# v1.6.1: change user agent string to latest Firefox
#       : add Twitter to quirks mode since Twitter no longer sends a <title> to even the latest Firefox user agent string
# v1.7  : add support for Open Graph titles
# v1.8  : add special extra quirks mode for Twitter because it doesn't seem to resemble a website in any way

# Disable globbing: URLs routinely contain ? and *, which must never be
# expanded against the filesystem.
set -f

# cURL options sent with every request. Held in arrays so that each option
# reaches cURL as exactly one argument, without eval or re-parsing.
CURL_COMMON_OPTS=(
  --location --insecure --silent
  --max-filesize 1048576
  --max-time 10
  -H "Accept-Encoding: gzip"
  -H "Accept: text/html"
  -H "Accept-Language: en-GB,en;q=0.5"
)

# Default identity: pretend to be Firefox.
CURL_DEFAULT_OPTS=(
  --interface 178.32.55.206
  -A "Mozilla/5.0 (X11; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0"
)

# Quirks identity: no -A, so cURL announces itself, and a different interface.
CURL_QUIRK_OPTS=(--interface 192.168.122.10)

# Sites that get the quirks identity (YouTube, ISPreview, Twitter).
CURL_QUIRK_RE='^https?://((m\.|www\.)?(youtube\.com|youtu\.be)|www\.ispreview\.co\.uk|(www\.)?twitter\.com)/'

# Twitter URLs; group 1 is the scheme, group 3 is everything after the host.
TWITTER_RE='^(https?://)(www\.)?twitter\.com/(.*)$'

# Private scratch directory, created by main and removed by cleanup.
work_dir=""

#######################################
# Remove the scratch directory. Installed as the EXIT trap so that it runs
# on every exit path, including the cURL error paths.
# Globals:   work_dir (read)
#######################################
cleanup() {
  [[ -n "$work_dir" ]] && rm -rf -- "$work_dir"
}

#######################################
# Decode HTML entities, line by line, from stdin to stdout (uses PHP).
#######################################
decode_html_entities() {
  php -r 'while(($line=fgets(STDIN)) !== FALSE) echo html_entity_decode($line, ENT_QUOTES|ENT_HTML401);'
}

#######################################
# Read one line from stdin (a URL, or -h/--help/-v/--version) and print the
# title of the page it points to on stdout.
# Globals:   CURL_*_OPTS, CURL_QUIRK_RE, TWITTER_RE (read), work_dir (written)
# Outputs:   the title, the usage/version text, or an error message, all on
#            stdout as before
# Returns:   0 normally; exits 1 if no scratch directory can be created
#######################################
main() {
  local url quirk="" content_type="" ret og_title
  local header body body_gz
  local -a curl_opts

  # -r keeps backslashes in the URL intact; surrounding whitespace is still
  # trimmed by read.
  read -r url

  # Quirks
  # Claim to be cURL
  if [[ "$url" =~ $CURL_QUIRK_RE ]]; then
    curl_opts=("${CURL_QUIRK_OPTS[@]}" "${CURL_COMMON_OPTS[@]}")
  else
    curl_opts=("${CURL_DEFAULT_OPTS[@]}" "${CURL_COMMON_OPTS[@]}")
  fi
  # Use Twitter mobile site
  if [[ "$url" =~ $TWITTER_RE ]]; then
    url="${BASH_REMATCH[1]}mobile.twitter.com/${BASH_REMATCH[3]}"
    quirk="twitter"
  fi

  case "$url" in
    -h | --help)
      echo "Usage: '!title <url>' where <url> is an http:// or https:// URL"
      return 0
      ;;
    -v | --version)
      echo "blatitle version 1.8"
      return 0
      ;;
  esac

  # Unpredictable, private scratch space instead of guessable /tmp names.
  if ! work_dir=$(mktemp -d "${TMPDIR:-/tmp}/blatitle.XXXXXX"); then
    work_dir=""
    echo "Could not create temporary directory"
    exit 1
  fi
  trap cleanup EXIT

  header="$work_dir/header"
  body="$work_dir/body"
  body_gz="$work_dir/body.gz"

  # --url makes cURL take the next argument as the URL even if it starts
  # with '-', and quoting keeps embedded whitespace from becoming options.
  # NOTE(review): cURL will still fetch any scheme it supports (file://,
  # ftp://, ...); consider --proto if the input is untrusted.
  curl "${curl_opts[@]}" -D "$header" -o "$body_gz" --url "$url"
  ret=$?
  if ((ret == 28)); then
    echo "Connection to host timed out!"
    exit 0 # fetch errors have always been reported with a zero exit status
  elif ((ret != 0)); then
    echo "cURL error $ret when fetching the page."
    exit 0
  fi

  # An empty response may leave no body file behind; treat it as empty.
  [[ -e "$body_gz" ]] || : > "$body_gz"

  # Decompress when some response claimed gzip. If gunzip fails (for example
  # only a redirect hop was compressed), fall back to the raw bytes.
  if ! { grep -qia "Content-Encoding: gzip" "$header" && gunzip -- "$body_gz"; }; then
    mv -- "$body_gz" "$body"
  fi

  grep -qia "Content-Type: text/plain" "$header" && content_type="text/plain"

  if [[ "$content_type" == "text/plain" || "$url" =~ ^gopher:// ]]; then
    # text/plain or gopher:// URL, just use the first non-blank line
    grep -a -m1 . "$body"
  elif [[ "$quirk" == "twitter" ]]; then
    # Extract the Twitter title: take the line after the first "tweet-text"
    # match, strip the tags, decode entities and trim whitespace.
    grep -a -A1 tweet-text "$body" \
      | head -n2 \
      | tail -n1 \
      | perl -pe 's|^.*?>||; s|<.*?>||g' \
      | decode_html_entities \
      | sed -r 's/^\s+// ; s/\s+$//'
  else
    # Probably HTML
    # Check for Open Graph og:title first; xmllint fails when no meta line
    # was found or the line is not well-formed XML.
    if og_title=$(xmllint --xpath 'string(/meta/@content)' \
      <(grep -a -Em1 '<.*meta.*property.*og:title' "$body") 2> /dev/null) \
      && [[ -n "$og_title" ]]; then
      printf '%s\n' "$og_title"
    else
      # None found (or it was empty), look for HTML <title>
      tr '\n\r' '  ' < "$body" \
        | grep -a -oiE '<title[^>]*>([^<]+)</title>' \
        | head -n 1 \
        | sed -r 's/<title[^>]*//I; s/^>//I; s/<\/title>$//I' \
        | decode_html_entities
    fi
  fi

  return 0
}

main "$@"