#!/bin/bash
#
# blatitle: read one URL from standard input and print a title for it on
# standard output.
#
#   * text/plain responses and gopher:// URLs: first non-blank line
#   * Twitter (quirks mode): text following the first "tweet-text" marker
#   * anything else: Open Graph og:title if present, else the HTML <title>
#
# Passing -h/--help or -v/--version instead of a URL prints usage/version.
# Status messages are written to stdout because the output is meant to be
# relayed verbatim by the caller.
#
# Dependencies: curl, gunzip, grep, sed, tr, perl, php, xmllint, mktemp.
#
# v0.1  : basic title extract
# v0.2  : added help arguments
# v0.3  : added version arguments
# v0.3.1: change !title to title in version info
# v0.4  : added case insensitive matching
# v0.5  : ignore SSL certificate errors
# v0.6  : switch to curl, add maximum filesize and time limits
# v0.7  : follow HTTP redirects, was broken since switch to curl
# v0.7.1: pretend to be Firefox, some broken websites reject cURL
# v0.8  : strip newlines and leading whitespace from "broken" titles
#       : update user agent string
# v0.8.1: change name in version string to "blatitle"
# v0.9  : add support for gzip compression
# v0.9.1: fix support for servers that don't support HTTP HEAD
# v0.9.2: tidy up temporary files afterwards
# v0.9.3: advertise text/html acceptance as some servers require it
# v0.9.4: switch to a cleaner awk script to detect titles
# v1.0  : if Content-Type is text/plain, just use the first line as the title
# v1.1  : decode HTML in the title using recode (adds recode as a dependency)
# v1.2  : deal with some cURL exit codes
# v1.2.1: decode HTML using PHP instead of recode (adds PHP as a dependency,
#         removes recode)
# v1.2.2: do case insensitive header checks
# v1.2.3: find the first non-blank text/plain line rather than selecting the
#         literal first line
# v1.3  : add support for Gopher (take first line as title, like text/plain)
# v1.3.1: add a specific interface for cURL to use
# v1.4  : set Accept-Language to en-GB
# v1.4.1: claim a more recent Firefox in the user agent string to make Twitter
#         send the real page title
# v1.5  : add a quirks mode, starting with claiming to be cURL for YouTube
# v1.5.1: disable Bash globbing
# v1.6  : handle multiple title tags, only spoof IP if in quirks mode, add
#         www.ispreview.co.uk to quirks mode
# v1.6.1: change user agent string to latest Firefox
#       : add Twitter to quirks mode since Twitter no longer sends a <title>
#         to even the latest Firefox user agent string
# v1.7  : add support for Open Graph titles
# v1.8  : add special extra quirks mode for Twitter because it doesn't seem to
#         resemble a website in any way
# v1.8.1: don't bind to a specific interface/IP

# Disable globbing: the URL is untrusted and must never be pathname-expanded.
set -f

# Private scratch directory; created lazily by main, removed by the EXIT trap.
BLATITLE_TMPDIR=''

# Remove the scratch directory, if one was created.
blatitle_cleanup() {
  if [[ -n "$BLATITLE_TMPDIR" ]]; then
    rm -rf -- "$BLATITLE_TMPDIR"
  fi
}

# Print the one-line usage message.
print_usage() {
  echo "Usage: '!title <url>' where <url> is an http:// or https:// URL"
}

# Decode HTML entities on stdin, line by line, to stdout.
decode_entities() {
  php -r 'while (($line = fgets(STDIN)) !== FALSE) echo html_entity_decode($line, ENT_QUOTES|ENT_HTML401);'
}

# Print the raw (still entity-encoded) text of the first non-empty <title>
# element found in the file $1. Line breaks inside the document are turned
# into spaces so that titles spanning several lines are matched, then the
# opening tag and surrounding whitespace are removed.
extract_title_tag() {
  # -a: after tr the input is one huge line; never let grep's binary-file
  # heuristic replace the match with a "Binary file matches" notice.
  tr '\r\n' '  ' < "$1" \
    | grep -a -oiE '<title[^>]*>([^<]+)' \
    | head -n 1 \
    | sed -E 's/^<title[^>]*>//I; s/^[[:space:]]+//; s/[[:space:]]+$//'
}

# Print the Open Graph og:title of the file $1. Returns xmllint's status,
# which is non-zero when no parsable og:title <meta> line was found.
extract_og_title() {
  xmllint --xpath 'string(/meta/@content)' \
    <(grep -a -Em1 '<.*meta.*property.*og:title' "$1") 2> /dev/null
}

# Print the tweet text from a mobile.twitter.com page stored in the file $1:
# take the line after the first "tweet-text" marker, drop markup, decode
# entities and trim surrounding whitespace.
extract_tweet_text() {
  grep -a -A1 tweet-text "$1" \
    | head -n 2 \
    | tail -n 1 \
    | perl -pe 's|^.*?>||; s|<.*?>||g' \
    | decode_entities \
    | sed -r 's/^\s+// ; s/\s+$//'
}

# Entry point: read the URL (or -h/--help/-v/--version) from stdin and print
# the result. Returns 1 only if the scratch directory cannot be created.
main() {
  local url quirk='' ret
  local header_file body_gz body_file
  local -r curl_ua_re='^https?://((m\.|www\.)?(youtube\.com|youtu\.be)/|www\.ispreview\.co\.uk/|(www\.)?twitter\.com/)'
  local -r twitter_re='^https?://(www\.)?twitter\.com/'
  local -r firefox_ua='Mozilla/5.0 (X11; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'

  # -r keeps backslashes in the URL intact. The status of read is ignored on
  # purpose so input without a trailing newline is still accepted.
  read -r url

  case "$url" in
    '' | -h | --help)
      print_usage
      return 0
      ;;
    -v | --version)
      echo "blatitle version 1.8"
      return 0
      ;;
  esac

  # Options are kept in an array so no value is ever re-parsed by the shell.
  # --proto keeps untrusted input away from file:// and other local schemes.
  local -a curl_opts=(
    --location --insecure --silent
    --max-filesize 1048576 --max-time 10
    --proto '=http,https,gopher' --proto-redir '=http,https'
    -H 'Accept-Encoding: gzip'
    -H 'Accept: text/html'
    -H 'Accept-Language: en-GB,en;q=0.5'
  )

  # Quirks: these sites only behave when cURL's own user agent is sent; all
  # others get a Firefox user agent.
  if [[ ! "$url" =~ $curl_ua_re ]]; then
    curl_opts+=(-A "$firefox_ua")
  fi

  # Quirks: use the Twitter mobile site, which has its own extraction path.
  if [[ "$url" =~ $twitter_re ]]; then
    url="${BASH_REMATCH[1]}mobile.twitter.com/${url#"${BASH_REMATCH[0]}"}"
    # BASH_REMATCH[1] is the "(www\.)?" group, not the scheme; rebuild the
    # scheme from the original prefix instead.
    url="${BASH_REMATCH[0]%%://*}://mobile.twitter.com/${url#*mobile.twitter.com/}"
    quirk='twitter'
  fi

  # Unpredictable scratch directory instead of guessable names in /tmp; the
  # trap removes it on every exit path, including the cURL error returns.
  BLATITLE_TMPDIR=$(mktemp -d "${TMPDIR:-/tmp}/blatitle.XXXXXX") || {
    echo "Could not create temporary directory"
    return 1
  }
  trap blatitle_cleanup EXIT
  header_file="$BLATITLE_TMPDIR/header"
  body_gz="$BLATITLE_TMPDIR/body.gz"
  body_file="$BLATITLE_TMPDIR/body"

  # --url passes the address as an option value, so a hostile "URL" cannot
  # smuggle in further cURL options.
  curl "${curl_opts[@]}" -D "$header_file" -o "$body_gz" --url "$url"
  ret=$?
  if [[ "$ret" -eq 28 ]]; then
    echo "Connection to host timed out!"
    return 0
  elif [[ "$ret" -ne 0 ]]; then
    echo "cURL error $ret when fetching the page."
    return 0
  fi

  # An empty response may leave no output file behind; treat it as empty.
  [[ -e "$body_gz" ]] || : > "$body_gz"

  # Decompress when the server says the body is gzip. If it is not announced
  # as gzip, or does not actually decompress, use the bytes as they are.
  if ! { grep -qia 'Content-Encoding: gzip' "$header_file" \
    && gunzip -- "$body_gz"; }; then
    mv -- "$body_gz" "$body_file"
  fi

  if grep -qia 'Content-Type: text/plain' "$header_file" \
    || [[ "$url" =~ ^gopher:// ]]; then
    # text/plain or gopher:// URL: the first non-blank line is the title.
    grep -a -m1 . "$body_file"
  elif [[ "$quirk" == 'twitter' ]]; then
    extract_tweet_text "$body_file"
  else
    # Probably HTML: prefer Open Graph og:title, fall back to <title>.
    if ! extract_og_title "$body_file"; then
      extract_title_tag "$body_file" | decode_entities
    fi
  fi

  return 0
}

main "$@"