Cross-platform Selenium parsing with Chrome/chromedriver

Today we will look at an example of parsing posts from social networks that mention a given keyword. For this we will use the service www.social-searcher.com.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
#from fake_useragent import UserAgent
import os 
import platform 
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
from selenium.webdriver.common.proxy import Proxy, ProxyType



plt = platform.system()
driver_path = None
browser_path = None
if plt == "Windows":
    #http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Win_x64/902856/
    driver_path = '../bin/windows/chromedriver_win32/chromedriver.exe'
    browser_path = '../bin/windows/chrome-win/chrome.exe'
    print(f"Your OS {plt}")
elif plt == "Linux":
    #http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/902856/
    driver_path = '../bin/linux/chromedriver_linux64/chromedriver'
    browser_path = '../bin/linux/chrome-linux/chrome'
    print(f"Your OS {plt}")
elif plt == "Darwin":
    #http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Mac/902856/
    driver_path = '../bin/mac/chromedriver_mac64/chromedriver'
    browser_path = '../bin/mac/chrome-mac/chrome'
    print(f"Your OS {plt}")
else:
    print(f"Unidentified {plt}")

options = webdriver.ChromeOptions()
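#The flags below are typical for headless scraping; --no-sandbox and
#--disable-dev-shm-usage are commonly needed when running inside Docker/CI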
#
#ua = UserAgent()
#userAgent = ua.random
#options.add_argument(f'user-agent={userAgent}')
#
#options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-extensions')
options.add_argument('--test-type')
#options.add_argument('--single-process')
options.add_argument('--disable-gpu')
options.add_argument('--disable-software-rasterizer')
#Both switches must go in a single call; a second add_experimental_option with the same key overwrites the first
options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
options.binary_location = browser_path
browser = webdriver.Chrome(executable_path = driver_path, options=options)
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (Linux; Android 8.1.0; Pixel Build/OPM4.171019.021.D1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 EdgA/42.0.0.2057", "platform":"Windows"})
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", "platform":"Windows"})

browser.set_window_size(1920, 1080)
#browser.set_window_size(1024, 1024)
browser.maximize_window()

baseurls = ["https://www.social-searcher.com"]
methods = ['social-buzz', 'google-social-search']
sites = ['fb', 'tw', 'in']
keyword = "העבודה"
page_to = 10

params = '?q=' + keyword + ''.join(['&' + str(p) + '=on' for p in sites])
url = f'{baseurls[0]}/{methods[1]}/' + params
print(url)

browser.get(url)
time.sleep(2)
browser.save_screenshot("./screenshot.png")

posts = []

def parse_list(browser, social_name, page_num = 1):
    print(f"Parsing {social_name} page #{page_num}")
    html = browser.page_source
    bs = BeautifulSoup(html, 'lxml')
    cards_html = bs.find('div', {'class': 'gsc-expansionArea'})
    cards_array = cards_html.find_all('div', {'class': 'gsc-webResult'})
    for card in cards_array:
        title = card.find('a', {'class': 'gs-title'}).text
        text = card.find('div', {'class': 'gs-snippet'}).text
        url = card.find('div', {'class': 'gs-visibleUrl-long'}).text
        pic = None
        try:
            pic = card.find('img', {'class': 'gs-image'})['src']
        except (TypeError, KeyError):
            #Not every result card has a preview image
            pass
        post = {'title': title, 'text': text, 'url': url, 'pic': pic}
        posts.append(post)

    time.sleep(3)
    browser.save_screenshot("./screenshot.png")

#Iterate over each social network's iframe column
for social in browser.find_elements_by_class_name('iframe-column'):
    social_name = social.find_element_by_class_name('network-name').text
    social_iframe = social.find_element_by_tag_name('iframe')
    browser.switch_to.frame(social_iframe)
    #
    parse_list(browser, social_name, page_num = 1)
    for page_num in range(2, page_to + 1, 1):
        page_link = browser.find_element_by_xpath(f"//div[@aria-label='Page {page_num}']")
        browser.execute_script("arguments[0].click();", page_link)
        parse_list(browser, social_name, page_num)
    #
    browser.switch_to.default_content()

print(posts)

browser.quit()
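
The script imports WebDriverWait, By and expected_conditions but never uses them, while the fixed time.sleep calls are its most fragile part. After switching into a network's iframe one could wait for the results container explicitly instead of sleeping; a minimal sketch, reusing the gsc-expansionArea container that parse_list already relies on:

wait = WebDriverWait(browser, 10)
#Block until Google CSE renders the results container, or raise TimeoutException
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'gsc-expansionArea')))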

The service offers paid plans as well as a limited free tier; the free tier is enough for us. The relevant endpoints are https://www.social-searcher.com/social-buzz/ and https://www.social-searcher.com/google-social-search/.

[Screenshot: result of a keyword search]

What does the script do? It opens the service at a generated URL that already encodes the keyword and the checkbox state for the chosen social networks, and then parses the results of each network page by page.
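
One caveat: the script splices the raw keyword into the query string. Chrome tolerates this, but for non-ASCII keywords such as the Hebrew ones used here it is safer to percent-encode the parameters; a minimal sketch using only the standard library:

from urllib.parse import urlencode

keyword = "העבודה"
sites = ['fb', 'tw', 'in']
#urlencode percent-escapes the Hebrew keyword and joins the parameters
params = urlencode({'q': keyword, **{s: 'on' for s in sites}})
url = 'https://www.social-searcher.com/google-social-search/?' + params
print(url)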

[Screenshot: parsing result in the terminal]

Below is an extended version of the parser that automatically iterates over a list of keywords.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
#from fake_useragent import UserAgent
import datetime
import os 
import platform 
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
from selenium.webdriver.common.proxy import Proxy, ProxyType



plt = platform.system()
driver_path = None
browser_path = None
if plt == "Windows":
    #Download a matching Chromium build from http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Win_x64/902856/
    driver_path = '../bin/windows/chromedriver_win32/chromedriver.exe'
    browser_path = '../bin/windows/chrome-win/chrome.exe'
    print(f"Your OS {plt}")
elif plt == "Linux":
    #Download a matching Chromium build from http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/902856/
    driver_path = '../bin/linux/chromedriver_linux64/chromedriver'
    browser_path = '../bin/linux/chrome-linux/chrome'
    print(f"Your OS {plt}")
elif plt == "Darwin":
    #Download a matching Chromium build from http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Mac/902856/
    driver_path = '../bin/mac/chromedriver_mac64/chromedriver'
    browser_path = '../bin/mac/chrome-mac/chrome'
    print(f"Your OS {plt}")
else:
    print(f"Unidentified {plt}")

options = webdriver.ChromeOptions()
#
#ua = UserAgent()
#userAgent = ua.random
#options.add_argument(f'user-agent={userAgent}')
#
#options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-extensions')
options.add_argument('--test-type')
#options.add_argument('--single-process')
options.add_argument('--disable-gpu')
options.add_argument('--disable-software-rasterizer')
#Both switches must go in a single call; a second add_experimental_option with the same key overwrites the first
options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
options.binary_location = browser_path
browser = webdriver.Chrome(executable_path = driver_path, options=options)
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (Linux; Android 8.1.0; Pixel Build/OPM4.171019.021.D1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 EdgA/42.0.0.2057", "platform":"Windows"})
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", "platform":"Windows"})

browser.set_window_size(1920, 1080)
browser.maximize_window()


parsing_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
baseurls = ["https://www.social-searcher.com"] #Service for parsing
methods = ['google-social-search', 'social-buzz'] #Subservices/plans for parsing
sites = ['fb', 'tw', 'in'] #Social networks for parsing
keywords = ["מפלגת העבודה", "שרת התחבורה", "מיכאלי", "מירב מיכאלי"] #Keywords
page_to = 10 #Total pages to parse (max = 10)
posts = [] #List for saving parsed posts

def parse_list(browser, social_name, keyword, page_num = 1):
    #Print current page info
    print(f"Parsing {social_name} with keyword @{keyword} on page #{page_num}")
    html = browser.page_source
    bs = BeautifulSoup(html, 'lxml')
    cards_html = bs.find('div', {'class': 'gsc-expansionArea'})
    cards_array = cards_html.find_all('div', {'class': 'gsc-webResult'})
    #Iterate over the post cards on the current page
    for card in cards_array:
        name = card.find('a', {'class': 'gs-title'}).text
        text = card.find('div', {'class': 'gs-snippet'}).text
        url = card.find('div', {'class': 'gs-visibleUrl-long'}).text
        date = None
        pic = None
        try:
            pic = card.find('img', {'class': 'gs-image'})['src']
        except (TypeError, KeyError):
            #Not every result card has a preview image
            pass
        #Assemble the post record
        post = {
            'date_of_add': parsing_date,
            'post': url, 
            'pic': pic, 
            'name': name,
            'date': date,
            'text': text,
            'keyword': keyword,
            'sentiment': None,
            'social_name': social_name
        }
        posts.append(post)
    time.sleep(3)
    browser.save_screenshot("./screenshot.png")


#Iterate over the keywords
for keyword in keywords:
    #Generate URL for parsing with keyword and social params
    params = '?q=' + keyword + ''.join(['&' + str(p) + '=on' for p in sites])
    url = f'{baseurls[0]}/{methods[0]}/' + params
    print(url)
    #Navigate to the generated URL
    browser.get(url)
    time.sleep(2)
    browser.save_screenshot("./screenshot.png")
    #Iterate over each social network's iframe column
    for social in browser.find_elements_by_class_name('iframe-column'):
        social_name = social.find_element_by_class_name('network-name').text
        social_iframe = social.find_element_by_tag_name('iframe')
        browser.switch_to.frame(social_iframe)
        #Parsing the first page
        parse_list(browser, social_name, keyword, page_num = 1)
        #Parsing other pages
        for page_num in range(2, page_to + 1, 1):
            page_link = browser.find_element_by_xpath(f"//div[@aria-label='Page {page_num}']")
            browser.execute_script("arguments[0].click();", page_link)
            parse_list(browser, social_name, keyword, page_num)
        #Switch browser content from iframe to main frame
        browser.switch_to.default_content()

print(posts)

browser.quit()
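
As with the first script, the collected posts are only printed. A minimal sketch for persisting them instead (the file name posts.json is an arbitrary choice):

import json

#Write the post dicts as UTF-8 JSON so the Hebrew text stays readable
with open('posts.json', 'w', encoding='utf-8') as f:
    json.dump(posts, f, ensure_ascii=False, indent=2)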
