ArchiveTweets
PlomWiki: Zur Start-Seite Suche Letzte Änderungen (Feed) Letzte Kommentare (Feed)
Impressum Datenschutz-Erklärung

ArchiveTweets

Ansicht Bearbeiten Anzeige-Titel setzen Versions-Geschichte Seiten-Passwort setzen AutoLink-Anzeige ein-/ausschalten

Kleines Shell-Skriptchen zum Sichern der letzten ~3200 Tweets (weiter lässt Twitter ja derzeit nicht zurückblicken) in einer Textdatei. Hab mir das Unterfangen so oft mühselig von Hand zusammengepfriemelt, wurde Zeit, dass ich es mir mal automatisiere!

#!/bin/bash
#
# Archive a user's last ~3200[*] tweets (or at least their dates, IDs
# and texts) in an easily readable text file.  Dependency: wget.
#
# [*] In practice it is easily fewer: Twitter seems to count done and
# undone re-tweets into the user_timeline.xml?count=200 page size, but
# does not deliver them.

# Name of the user whose tweets are to be retrieved.
user='plomlompom'

# Name of the file the archived tweets are collected in.
archive_file='ArchivedTweets.txt'

# How often to retry wgetting a page before giving up.
try_max=3

# File name to expect from the URL and to write to the filesystem.
filename="user_timeline.xml?screen_name=$user&count=200&page=0"
 
# Collect XML of Twitter messages via the API, paging backwards through
# the user timeline: each pass wgets one 200-tweet chunk, and the last
# (i.e. oldest) status ID of that chunk becomes the max_id of the next
# request.  At most $cycles_max chunks are fetched.
cycle=0
cycles_max=100
while [ "$cycle" -lt "$cycles_max" ]
do
  cycle=$((cycle + 1))

  # URL to retrieve the XML from / filename to store it under.
  # NB: $filename contains '?' and '&', so it is quoted everywhere to
  # avoid pathname expansion and word splitting.
  url="http://api.twitter.com/1/statuses/$filename"

  # Try to wget the chunk up to $try_max times.
  try=0
  while [ ! -f "$filename" ] && [ "$try" -ne "$try_max" ]; do
    wget "$url"
    try=$((try + 1))
  done

  # If unsuccessful, abort the script here.
  if [ ! -f "$filename" ]; then
    echo "Could not retrieve more data, stopped after trying $try times."
    exit 1
  fi

  # Rebuild $filename, using the last tweet ID of this chunk as the
  # max_id of the next request.  Status IDs sit at exactly four spaces
  # of indentation ('    <id>'); deeper-nested IDs (e.g. of the user
  # object) do not match the anchored pattern.
  lastID=$(grep '^    <id>' "$filename" | tail -n 1 \
           | sed 's/    <id>//;s/<\/id>//')
  filename="user_timeline.xml?screen_name=$user&count=200&max_id=$lastID"
done
 
# Later used to check whether a chunk being processed is the first one:
# every follow-up chunk's file name starts with this '...&max_id='
# prefix, so we remember the prefix and its length.
test_cut="user_timeline.xml?screen_name=$user&count=200&max_id="
# ${#var} replaces the old 'echo | wc -c' plus 'expr - 1' round-trip
# (identical for this ASCII-only string).
lentest=${#test_cut}
count=0
 
# Process the downloaded XML chunks into $archive_file: join broken
# text lines, reduce each status to three single-line fields, drop
# overlapping tweets, decode entities and reformat the dates.
temp='temp'
# NB: chunk file names contain '?' and '&' but no whitespace, so parsing
# the ls output is safe here; -r reverse-sorts so that the page=0 chunk
# (lexically largest suffix) comes first, followed by the max_id chunks.
# The glob now uses "$user" instead of a hard-coded user name.
for filename in $(ls -r1 "user_timeline.xml?screen_name=$user&count=200&"*)
do
  archive_chunk="archive_chunk_$count"
  count=$((count + 1))

  # Join status text message lines broken by newlines, so that each
  # <text> element occupies exactly one line afterwards.  The IFS=
  # newline for-loop (rather than 'read') intentionally keeps the
  # original behavior of collapsing empty lines.
  text_line=''
  text_start=0     # 1 while inside a <text> element
  text_end=0       # 1 once the closing </text> has been seen
  OIFS=$IFS
  IFS='
'
  for line in $(cat "$filename")
  do
    if [ "$text_start" -eq 0 ]
    then
      text_start=$(echo "$line" | grep -c -E '^    <text>')
    fi
    if [ "$text_start" -eq 1 ]
    then
      text_line=$text_line$line
      if [ "$text_end" -eq 0 ]
      then
        text_end=$(echo "$line" | grep -c -E '</text>$')
      fi
      if [ "$text_end" -eq 1 ]
      then
        echo "$text_line" >> "$temp"
        text_start=0
        text_end=0
        text_line=''
      else
        # Re-join broken text lines with a single space.
        text_line=$text_line' '
      fi
    else
      echo "$line" >> "$temp"
    fi
  done
  IFS=$OIFS
  mv "$temp" "$filename"

  # Format XML to a simpler style reduced to three single-line fields;
  # the never-occurring marker string yields one empty separator line
  # per </status>.  All substitutions are line-wise and anchored, so a
  # single sed with ordered -e expressions equals the old pipe chain.
  sed -e 's/^    <created_at>\([[:print:]]*\)<\/created_at>$/Date: \1/g' \
      -e 's/^    <id>\([[:print:]]*\)<\/id>$/ID: \1/g' \
      -e 's/^    <text>\([[:print:]]*\)<\/text>$/Text: \1/g' \
      -e 's/^  <\/status>/dieserstringkommtsichernievoreinself/g' \
      "$filename" | \
  grep '\(^Date: \)\|\(^ID: \)\|\(^Text: \)\|\(^dieserstringkommtsichernievoreinself\)' | \
  sed 's/dieserstringkommtsichernievoreinself//g' > "$archive_chunk"

  # Eliminate intersecting tweets: every max_id chunk repeats the last
  # tweet of the previous chunk as its first entry (Date/ID/Text plus
  # separator line = 4 lines), so drop those from non-first chunks.
  filename_cut=$(echo "$filename" | cut -c -"$lentest")
  if [ "$filename_cut" = "$test_cut" ]
  then
    tail -n +5 "$archive_chunk" > "$temp"
    mv "$temp" "$archive_chunk"
  fi

  # Translate special symbols (HTML entities / numeric character
  # references) to their UTF-8 equivalents.  Order is preserved from
  # the original chain: '&amp;' is decoded both first and last, so
  # double-escaped '&amp;lt;'/'&amp;gt;' also end up fully decoded.
  sed -e 's/\&amp;/\&/g' \
      -e 's/\&#183;/·/g' \
      -e 's/\&#8222;/„/g' \
      -e 's/\&#8595;/↓/g' \
      -e 's/\&#243;/ó/g' \
      -e 's/\&#359;/ŧ/g' \
      -e 's/\&#9;//g' \
      -e 's/\&#227;/ã/g' \
      -e 's/\&#8216;/‘/g' \
      -e 's/\&#8232;/⁰/g' \
      -e 's/\&#176;/°/g' \
      -e 's/\&#65279;//g' \
      -e 's/\&#173;//g' \
      -e 's/\&gt;/>/g' \
      -e 's/\&#178;/²/g' \
      -e 's/\&#223;/ß/g' \
      -e 's/\&#228;/ä/g' \
      -e 's/\&#246;/ö/g' \
      -e 's/\&#252;/ü/g' \
      -e 's/\&#196;/Ä/g' \
      -e 's/\&#214;/Ö/g' \
      -e 's/\&#220;/Ü/g' \
      -e 's/\&quot;/"/g' \
      -e 's/\&#8364;/€/g' \
      -e 's/\&#167;/§/g' \
      -e 's/\&#180;/´/g' \
      -e 's/\&#8230;/…/g' \
      -e 's/\&#8211;/–/g' \
      -e 's/\&#8212;/—/g' \
      -e 's/\&#8220;/“/g' \
      -e 's/\&#8221;/”/g' \
      -e 's/\&#187;/»/g' \
      -e 's/\&#171;/«/g' \
      -e 's/\&#8217;/’/g' \
      -e 's/\&#233;/é/g' \
      -e 's/\&lt;/</g' \
      -e 's/\&gt;/>/g' \
      -e 's/\&amp;/\&/g' \
      "$archive_chunk" > "$temp"
  mv "$temp" "$archive_chunk"

  # Translate dates into the preferred format: strip the weekday,
  # number the month, tag '+0000' as UTC and move the year to the
  # front, e.g. 'Tue Mar 29 13:00:00 +0000 2011' -> 'Date: 2011-03-29
  # 13:00:00 UTC'.
  sed -e 's/^\(Date: \)[[:alpha:]]\{3\} /\1/g' \
      -e 's/^\(Date: \)Jan /\101-/g' \
      -e 's/^\(Date: \)Feb /\102-/g' \
      -e 's/^\(Date: \)Mar /\103-/g' \
      -e 's/^\(Date: \)Apr /\104-/g' \
      -e 's/^\(Date: \)May /\105-/g' \
      -e 's/^\(Date: \)Jun /\106-/g' \
      -e 's/^\(Date: \)Jul /\107-/g' \
      -e 's/^\(Date: \)Aug /\108-/g' \
      -e 's/^\(Date: \)Sep /\109-/g' \
      -e 's/^\(Date: \)Oct /\110-/g' \
      -e 's/^\(Date: \)Nov /\111-/g' \
      -e 's/^\(Date: \)Dec /\112-/g' \
      -e 's/^\(Date: [[:print:]]*\) +0000/\1 UTC/g' \
      -e 's/^\(Date: \)\([[:print:]]*\) \([[:digit:]]*\)/\1\3-\2/g' \
      "$archive_chunk" > "$temp"
  mv "$temp" "$archive_chunk"

  cat "$archive_chunk" >> "$archive_file"

done
 
# Clean up: remove the intermediate archive chunks and the raw XML
# chunk files.  The glob now uses "$user" instead of a hard-coded
# user name, matching the rest of the script.
rm 'archive_chunk_'*
rm "user_timeline.xml?screen_name=$user&count=200&"*

Kommentare

Keine Kommentare zu dieser Seite.

Schreibe deinen eigenen Kommentar

Kommentar-Schreiben derzeit nicht möglich: Kein Captcha gesetzt.

PlomWiki-Engine lizensiert unter der AGPLv3. Quellcode verfügbar auf GitHub.