Extracting mailing lists from Belwue

Problem: We host our mail at Belwue, where we create and modify mailing lists. I would also like to "publish" these lists internally, but Belwue does not permit exporting the data, and typing it all out by hand is too tedious.

Solution: a script that fetches the data from Belwue via the "administrator" account.

Scrapy on Ubuntu 12.04

  sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 627220E7
  echo 'deb http://archive.scrapy.org/ubuntu scrapy main' | sudo tee /etc/apt/sources.list.d/scrapy.list
  sudo apt-get update && sudo apt-get install scrapy-0.24
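
A quick sanity check that the packaged Scrapy is importable, run in a Python shell:

import scrapy
print scrapy.__version__   # should report 0.24.x
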
  • Start the project:
# scrapy startproject belwue
# cd belwue/
# scrapy genspider maillisten mbox1.belwue.de
  • Later, run the crawl from inside the project directory:
# scrapy crawl maillisten
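
For orientation: with the default template, "genspider" creates only a minimal spider skeleton, roughly like the following (the exact template wording varies between Scrapy versions); the sections below replace it with the real code.

import scrapy

class MaillistenSpider(scrapy.Spider):
    name = "maillisten"
    allowed_domains = ["mbox1.belwue.de"]
    start_urls = (
        'http://www.mbox1.belwue.de/',
    )

    def parse(self, response):
        pass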

Scripts

Required files, as created by "startproject":

./belwue
./belwue/spiders
./belwue/spiders/__init__.py   <- empty
./belwue/spiders/maillisten.py
./belwue/settings.py
./belwue/items.py
./belwue/pipelines.py
./belwue/scrape_and_upload.sh  <- created manually for the upload
./belwue/__init__.py           <- empty
./scrapy.cfg

settings

belwue/settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'belwue'
SPIDER_MODULES = ['belwue.spiders']
NEWSPIDER_MODULE = 'belwue.spiders'
ITEM_PIPELINES = {'belwue.pipelines.BelwuePipeline': 1}

# site-specific values, imported by both the spider and the pipeline
emaildomain = u"@humboldt-ka.de"
admindomain = u"humboldt-gymnasium-karlsruhe.de"
password = "xxxxxxxx"
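
As an optional variation, the password could be read from the environment instead of being hard-coded, so settings.py can be shared without the secret; BELWUE_PASSWORD is a hypothetical variable name:

import os
password = os.environ.get('BELWUE_PASSWORD', 'xxxxxxxx')  # falls back to the placeholder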

items

belwue/items.py
# -*- coding: utf-8 -*-                                                    
import scrapy
 
class BelwueItem(scrapy.Item):
    groupname = scrapy.Field()        # local part of the list address
    groupprettyname = scrapy.Field()  # display name of the group
    groupsize = scrapy.Field()        # number of members (as scraped text)
    grouplink = scrapy.Field()        # relative admin link to the group page
    members = scrapy.Field()          # local parts of the member addresses
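
For illustration, a populated item might look like this (all values are hypothetical; in a real run they come from the scraped pages):

item = BelwueItem()
item['groupname'] = u"lehrer"                  # address becomes lehrer@humboldt-ka.de
item['groupprettyname'] = [u"Alle Lehrer"]     # extract() returns a list; the exporter uses [0]
item['groupsize'] = u"42"
item['grouplink'] = u"Group.html"              # hypothetical relative link from the overview page
item['members'] = [u"mustermann", u"schmidt"]  # local parts; the domain is appended on export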

pipeline

pipelines.py
# -*- coding: utf-8 -*-
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.contrib.exporter import BaseItemExporter
import time
from belwue.settings import admindomain,emaildomain
 
class WikiItemExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.output = file
 
    def _write_heading(self, string, output, which=1):
        output.write("="*which+string.encode('utf8')+"="*which+"\n")
 
    def _write_bullet(self, string, output, indent=1):
        output.write("  "*indent+"* "+string.encode('utf8')+"\n")
 
    def _encode_wikilink(self, linkstring, string="Link"):
        return "[["+linkstring+u"|"+string+"]]"
 
 
class MyWikiItemExporter(WikiItemExporter):
    def __init__(self, header, body, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.body = body
        self.header = header
 
    def start_exporting(self):
        self._write_heading(u"Aktueller Stand der Verteilerlisten", self.header, 6)
        self.header.write(u"**Stand: "+time.strftime('%d.%b %Y')+u"**\n")
        self._write_heading(u"Alle Verteilerlisten", self.header, 5)
 
    def finish_exporting(self):
        pass
 
    def export_item(self, item):
        # print overview to header
        bullet = self._encode_wikilink(item['groupname']+emaildomain, item['groupprettyname'][0])\
            +u": Mitglieder "\
            +item['groupsize']\
            +u" - " +\
            self._encode_wikilink(u"https://mbox1.belwue.de:9010/DomainAdmin/"+admindomain+"/"+item['grouplink'], u"Administratorlink zur Verwaltung")
        self._write_bullet(bullet, self.header)
 
        # print body
        self._write_heading(item['groupprettyname'][0], self.body, 5)
        for member in item['members']: 
            bullet = self._encode_wikilink(member+emaildomain, u"")
            self._write_bullet(bullet, self.body)
        self.body.write("\n")
 
class BelwuePipeline(object):
    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
 
    def spider_opened(self, spider):
        self.header = open('heading.txt', 'w+b')
        self.body = open('body.txt','w+b')
        self.exporter = MyWikiItemExporter(self.header, self.body)
        self.exporter.start_exporting()
 
    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.header.close()
        self.body.close()
 
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
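
To preview the generated DokuWiki markup without a full crawl, the exporter can be driven by hand with an item populated as in the example under "items" (a minimal sketch; the file handling mirrors BelwuePipeline above):

from belwue.pipelines import MyWikiItemExporter

header = open('heading.txt', 'w+b')
body = open('body.txt', 'w+b')
exporter = MyWikiItemExporter(header, body)
exporter.start_exporting()
exporter.export_item(item)   # 'item' built as in the items example
exporter.finish_exporting()
header.close()
body.close()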

spider

spiders/maillisten.py
# -*- coding: utf-8 -*-     
from scrapy import Spider
from belwue.items import BelwueItem
from scrapy.selector import Selector
from scrapy.http import Request
from urlparse import urljoin
from belwue.settings import admindomain
from belwue.settings import password
 
class MaillistenSpider(Spider):
    name = "maillisten"
    allowed_domains = ["mbox1.belwue.de"]
    http_user = u"admin"
    http_pass = password
    start_urls = [
        u"https://mbox1.belwue.de:9010/DomainAdmin/"+admindomain+u"/ObjectList.html?InCluster=1&domainName="+admindomain+u"&"
    ]
 
    def parse(self, response):
        #filename = response.url.split("/")[-2]
        #with open(filename, 'wb') as f:
        #    f.write(response.body)
        items = []
        links = []
        hxs = Selector(response)
 
        xpath = '/descendant-or-self::node()/child::table[attribute::class="settingsBox"]/child::tr[position()=3]/child::td/child::table/child::tr[td="Gruppe"]'
 
        xpathlink = '/descendant-or-self::node()/child::table[attribute::class="settingsBox"]/child::tr[position()=3]/child::td/child::table/child::tr[td="Gruppe"]/td/a/@href'
        links = hxs.xpath(xpathlink).extract()
        xpathname = '/descendant-or-self::node()/child::table[attribute::class="settingsBox"]/child::tr[position()=3]/child::td/child::table/child::tr[td="Gruppe"]/td/a/text()'
        names = hxs.xpath(xpathname).extract()
        xpathsize = '/descendant-or-self::node()/child::table[attribute::class="settingsBox"]/child::tr[position()=3]/child::td/child::table/child::tr[td="Gruppe"]/td[position()=3]/text()'
        sizes = hxs.xpath(xpathsize).extract()
 
        for i in range(0,len(names)):
#        for i in range(0,1):
            # risky: if links, names or sizes are not all retrieved correctly, the lists end up with unequal lengths
            item = BelwueItem()
            item['grouplink'] = links[i]
            item['groupname'] = names[i]
            item['groupsize'] = sizes[i]
            items.append(item)
            #yield item
            yield Request(urljoin(response.url, links[i]), meta={'item': item}, callback=self.parse_job)
 
    def parse_job(self, response):
        hxs = Selector(response)
        item = response.request.meta['item']
        # table.settingsBox:nth-child(5) > tbody:nth-child(2) > tr:nth-child(1) > td:nth-child(1) > input:nth-child(1)
        xpathmemb = '/descendant-or-self::node()/child::table[attribute::class="settingsBox" and child::caption="Mitglieder"]/child::tr/child::td/child::input[string-length(@value)!=0]/@value'
        item['members'] = hxs.xpath(xpathmemb).extract()
        # table.settingsBox:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1)
        xpathprettyname = '/descendant-or-self::node()/child::table[attribute::class="settingsBox" and position()=1]/tr[position()=1]/td[position()=2]/input/@value'
        item['groupprettyname']= hxs.xpath(xpathprettyname).extract()
        yield item
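
The long XPath expressions are the fragile part of the spider. They can be tested outside a full crawl, either interactively with "scrapy shell <url>" or against a locally saved copy of one group page (a sketch; group.html is a hypothetical file saved from the browser):

from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# build a fake response from the saved page; the url value only serves as a base
response = HtmlResponse(url='https://mbox1.belwue.de:9010/',
                        body=open('group.html').read())
xpathmemb = ('/descendant-or-self::node()/child::table[attribute::class="settingsBox"'
             ' and child::caption="Mitglieder"]/child::tr/child::td'
             '/child::input[string-length(@value)!=0]/@value')
print Selector(response).xpath(xpathmemb).extract()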

deployment config

scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
 
[settings]
default = belwue.settings
 
[deploy]
#url = http://localhost:6800/
project = belwue

upload script

belwue/scrape_and_upload.sh
#!/bin/bash

# target directory in the DokuWiki pages tree on the web space, and the ssh key for the upload
target="ab12345@pubwww5.belwue.de:/srv/www/virtual/23483/www.humboldt-gymnasium-karlsruhe.de/vhostdata/htdoc/portfolio/data/pages/it-document/mailserver"
key="/home/user/sshkey.rsa"

# run the crawl, then join the two partial files into a single wiki page
scrapy crawl maillisten
cat heading.txt body.txt > maillisten.txt

# upload via rsync over ssh; in the wiki the page appears as "verteilerlisten"
rsync -a -e "ssh -o ConnectTimeout=5 -i ${key}" maillisten.txt ${target}/verteilerlisten.txt

# clean up the intermediate files
rm heading.txt body.txt maillisten.txt
