Skip to content

Commit

Permalink
Add MNetPage spider (#80)
Browse files Browse the repository at this point in the history
* Add spider for crawling muusikoiden.net web forum messages

 Committer: bonskotti <[email protected]>

* Make MNetPageItem use FollowAndParse

* Conform to latest flake8 settings

* Update documentation

---------

Co-authored-by: bonskotti <[email protected]>
  • Loading branch information
jmyrberg and bonskotti authored Sep 10, 2023
1 parent 78861b7 commit 049f097
Show file tree
Hide file tree
Showing 7 changed files with 283 additions and 70 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
The library provides an easy-to-use API for fetching data from various Finnish websites:

| Website | Type | Spider API class |
| -------------------------------------------------------------- | ----------------- | ------------------ |
|----------------------------------------------------------------|-------------------|--------------------|
| [Ilta-Sanomat](https://www.is.fi) | News article | `ISArticle` |
| [Iltalehti](https://www.il.fi) | News article | `ILArticle` |
| [YLE Uutiset](https://www.yle.fi/uutiset) | News article | `YLEArticle` |
| [Suomi24](https://keskustelu.suomi24.fi) | Discussion thread | `Suomi24Page` |
| [Muusikoiden.net](https://www.muusikoiden.net) | Discussion thread | `MNetPage` |
| [Vauva](https://www.vauva.fi) | Discussion thread | `VauvaPage` |
| [Oikotie Asunnot](https://asunnot.oikotie.fi/myytavat-asunnot) | Apartment ad | `OikotieApartment` |
| [Tori](https://www.tori.fi) | Item deal | `ToriDeal` |
Expand Down
1 change: 1 addition & 0 deletions SPIDERS
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
ilarticle
isarticle
mnetpage
oikotieapartment
suomi24page
torideal
Expand Down
8 changes: 8 additions & 0 deletions docs/source/finscraper.scrapy_spiders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ finscraper.scrapy\_spiders.mixins module
:undoc-members:
:show-inheritance:

finscraper.scrapy\_spiders.mnetpage module
------------------------------------------

.. automodule:: finscraper.scrapy_spiders.mnetpage
:members:
:undoc-members:
:show-inheritance:

finscraper.scrapy\_spiders.oikotieapartment module
--------------------------------------------------

Expand Down
154 changes: 154 additions & 0 deletions finscraper/scrapy_spiders/mnetpage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
"""Module for Muusikoiden.net forum spider."""

import time

from scrapy import Item, Field, Selector
from scrapy.crawler import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader

from finscraper.scrapy_spiders.mixins import FollowAndParseItemMixin
from itemloaders.processors import TakeFirst, Identity, Compose
from finscraper.text_utils import strip_join, strip_elements, \
drop_empty_elements


class _MNetPageSpider(FollowAndParseItemMixin, Spider):
    """Spider for crawling muusikoiden.net discussion threads.

    Follows conference/thread listing pages under ``keskustelu/`` and
    parses individual thread pages (``posts.php`` urls) into
    ``_MNetPageItem`` items.
    """

    name = 'mnetpage'
    start_urls = ['https://muusikoiden.net/keskustelu/conferences.php']
    # Follow listing pages but not the message pages themselves --
    # those are picked up by ``item_link_extractor`` below.
    # NOTE: "." and "?" are escaped so only literal "posts.php?..."
    # urls match; the unescaped pattern ("posts.php?.+") made the last
    # "p" optional and "." a wildcard, matching unintended urls.
    follow_link_extractor = LinkExtractor(
        allow_domains=('muusikoiden.net',),
        allow=(r'keskustelu/\d+',),
        deny=(r'posts\.php\?.+',),
        deny_domains=(),
        canonicalize=True
    )
    item_link_extractor = LinkExtractor(
        allow_domains=('muusikoiden.net',),
        allow=(r'keskustelu/posts\.php\?.+',),
        deny=(),
        deny_domains=(),
        canonicalize=True
    )
    custom_settings = {}

    def __init__(self, *args, **kwargs):
        """Fetch threads from muusikoiden.net discussions.
        Args:
        """
        # The docstring above is rendered into the public ``MNetPage``
        # API docs via ``_get_docstring`` -- keep it user-facing.
        super(_MNetPageSpider, self).__init__(*args, **kwargs)

    def _parse_message(self, message: Selector, author: str, time_posted: str):
        """Parse a single message on a thread page.
        Args:
            message (Selector): Selector for the message markup
            author (str): Name of the message author
            time_posted (str): Time the message was posted
        Returns:
            _MNetMessageItem: Item with data about the message
        """
        il = ItemLoader(item=_MNetMessageItem(), selector=message)
        il.add_value('author', author)
        il.add_value('time_posted', time_posted)
        # Quoted text is wrapped in <i class="quote"> elements.
        il.add_xpath('quotes', '//i[@class="quote"]//text()')
        il.add_xpath('content', '//font[@class="msg"]/text()[not(self::i)]')

        return il.load_item()

    def _parse_item(self, resp):
        """Parse a thread page into a page item with its messages.
        Args:
            resp (HtmlResponse): Response from fetching the thread
        Returns:
            _MNetPageItem: Item with the page url, scrape time, title,
                page number and the parsed messages on the page.
        """
        il = ItemLoader(item=_MNetPageItem(), response=resp)
        il.add_value('url', resp.url)
        il.add_value('time', int(time.time()))
        il.add_xpath('title', '//b[contains(@class, "linkcolor")]//text()')
        il.add_xpath('page_number', '(//b[@class="selected_page"])[1]//text()')

        messages = []
        # Authors and timestamps live outside the message markup, so
        # they are collected separately and matched to messages by
        # position (assumes the three node lists are parallel).
        message_authors = resp.xpath('//a[@class="keskustelu_nick"]//text()')
        message_timestamps = resp.xpath('//font[@class="light"]//text()')

        # parse messages in thread
        for i, message in enumerate(resp.xpath('//font[@class="msg"]')):
            messages.append(
                self._parse_message(
                    message=Selector(text=message.get()),
                    author=message_authors[i].get(),
                    time_posted=message_timestamps[i].get(),
                )
            )

        # add messages into thread item
        il.add_value('messages', messages)

        return il.load_item()


class _MNetMessageItem(Item):
    # Scrapy item for a single forum message; its docstring is
    # concatenated into the public MNetPage API docs (see
    # ``_get_docstring`` in finscraper/spiders.py), so keep it
    # user-facing.
    """
    Returned message fields:

    * author (str): Author of the message.
    * time_posted (str): Publish time of the message.
    * quotes (list of str): List of quotes in the message.
    * content (str): Contents of the message.
    """
    # Joined from text fragments; first (only) value kept.
    author = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    # Stripped of surrounding whitespace before taking the first value.
    time_posted = Field(
        input_processor=strip_join,
        output_processor=Compose(strip_elements, TakeFirst())
    )
    # Kept as a list -- a message may contain several quotes.
    quotes = Field(
        input_processor=drop_empty_elements,
        output_processor=Identity()
    )
    content = Field(
        input_processor=strip_join,
        output_processor=TakeFirst()
    )
    # NOTE(review): never populated by ``_parse_message`` and not
    # listed in the docstring above -- looks like a leftover from
    # ``_MNetPageItem``; confirm before removing.
    page_number = Field(
        input_processor=Identity(),
        output_processor=Identity()
    )


class _MNetPageItem(Item):
    # Scrapy item for one thread page. ``__doc__`` is built explicitly
    # so the message-item field documentation is appended; the combined
    # docstring is rendered into the public MNetPage API docs via
    # ``_get_docstring`` in finscraper/spiders.py.
    __doc__ = """
    Returned page fields:

    * url (str): Url of the thread page.
    * time (int): UNIX timestamp of the scraping.
    * title (str): Title of the thread.
    * page_number (int): Number of the page in the thread.
    * messages (list of str): Messages on the thread page.
    """ + _MNetMessageItem.__doc__
    url = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    time = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    title = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    page_number = Field(
        input_processor=Identity(),
        output_processor=TakeFirst()
    )
    # List of _MNetMessageItem dicts parsed from the page; kept as-is.
    messages = Field(
        input_processor=Identity(),
        output_processor=Identity()
    )
59 changes: 41 additions & 18 deletions finscraper/spiders.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
_VauvaPageItem
from finscraper.scrapy_spiders.ylearticle import _YLEArticleSpider, \
_YLEArticleItem
from finscraper.scrapy_spiders.oikotieapartment import \
_OikotieApartmentSpider, _OikotieApartmentItem
from finscraper.scrapy_spiders.torideal import _ToriDealSpider, \
_ToriDealItem


__wrapper_doc__ = '''
from finscraper.scrapy_spiders.oikotieapartment import (
_OikotieApartmentSpider,
_OikotieApartmentItem,
)
from finscraper.scrapy_spiders.torideal import _ToriDealSpider, _ToriDealItem
from finscraper.scrapy_spiders.mnetpage import _MNetPageSpider, _MNetPageItem

__wrapper_doc__ = """
jobdir (str or None, optional): Working directory of the spider.
Defaults to None, which creates a temp directory to be used.
Note that this directory will only be deleted through the ``clear`` method!
Expand All @@ -33,13 +34,15 @@
.. note::
This parameter can be overridden through Scrapy ``settings``
(LOG_LEVEL, LOG_ENABLED) within the ``scrape`` -method.
'''
"""


def _get_docstring(spider_cls, item_cls):
    """Compose the public docstring of a spider API wrapper class.

    Concatenates the spider's ``__init__`` docstring (spider-specific
    parameters), the shared wrapper parameter documentation
    (``__wrapper_doc__``) and the item class docstring (returned
    fields).

    Args:
        spider_cls: Spider class whose ``__init__`` docstring documents
            the spider-specific parameters.
        item_cls: Item class whose docstring documents the returned
            fields.

    Returns:
        str: The combined docstring.
    """
    # The diff artifact that interleaved the old and new bodies of this
    # function has been resolved in favor of the new implementation.
    return (
        spider_cls.__init__.__doc__.strip() +
        indent(__wrapper_doc__, " " * 12) +
        indent(item_cls.__doc__, " " * 4)
    )


class ISArticle(_SpiderWrapper):
Expand All @@ -51,7 +54,8 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
spider_params=dict(),
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
log_level=log_level,
)


class ILArticle(_SpiderWrapper):
Expand All @@ -63,7 +67,8 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
spider_params=dict(),
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
log_level=log_level,
)


class YLEArticle(_SpiderWrapper):
Expand All @@ -75,7 +80,8 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
spider_params=dict(),
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
log_level=log_level,
)


class Suomi24Page(_SpiderWrapper):
Expand All @@ -87,7 +93,8 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
spider_params=dict(),
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
log_level=log_level,
)


class VauvaPage(_SpiderWrapper):
Expand All @@ -99,7 +106,8 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
spider_params=dict(),
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
log_level=log_level,
)


class OikotieApartment(_SpiderWrapper):
Expand All @@ -112,7 +120,8 @@ def __init__(self, area=None, jobdir=None, progress_bar=True,
spider_params=dict(area=area),
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
log_level=log_level,
)


class ToriDeal(_SpiderWrapper):
Expand All @@ -124,4 +133,18 @@ def __init__(self, jobdir=None, progress_bar=True, log_level=None):
spider_params=dict(),
jobdir=jobdir,
progress_bar=progress_bar,
log_level=log_level)
log_level=log_level,
)


class MNetPage(_SpiderWrapper):
    # Public API wrapper for the muusikoiden.net forum spider; the
    # docstring is assembled from the spider and item documentation.
    __doc__ = _get_docstring(_MNetPageSpider, _MNetPageItem)

    def __init__(self, jobdir=None, progress_bar=True, log_level=None):
        # MNetPage takes no spider-specific parameters.
        super().__init__(
            spider_cls=_MNetPageSpider,
            spider_params={},
            jobdir=jobdir,
            progress_bar=progress_bar,
            log_level=log_level,
        )
3 changes: 2 additions & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ markers =
vauvapage: Mark test as a vauvapage test.
suomi24page: Mark test as a suomi24page test.
oikotieapartment: Mark test as a oikotieapartment test.
torideal: Mark test as a torideal test.
torideal: Mark test as a torideal test.
mnetpage: Mark test as a mnetpage test.
Loading

0 comments on commit 049f097

Please sign in to comment.