diff --git a/Automation/src/GSoC-Organizations-Data/README.md b/Automation/src/GSoC-Organizations-Data/README.md new file mode 100644 index 00000000..1223c311 --- /dev/null +++ b/Automation/src/GSoC-Organizations-Data/README.md @@ -0,0 +1,15 @@ +# GSoC 2018 Organizations Data +## What this script does +This Python script retrieves all the organizations' names, the technologies they use for their open source projects, topic categories and topic names given in this +[page](https://summerofcode.withgoogle.com/archive/2018/organizations/) and converts them into a nice Excel file. + +## Libraries Used +#### Selenium +#### XlsxWriter + +## Instructions +Run the following command in your terminal: +```sh +python scraper.py +``` + diff --git a/Automation/src/GSoC-Organizations-Data/requirements.txt b/Automation/src/GSoC-Organizations-Data/requirements.txt new file mode 100644 index 00000000..5364f71a --- /dev/null +++ b/Automation/src/GSoC-Organizations-Data/requirements.txt @@ -0,0 +1,4 @@ +pkg-resources==0.0.0 +selenium==3.141.0 +urllib3==1.25.3 +XlsxWriter==1.1.8 diff --git a/Automation/src/GSoC-Organizations-Data/results.xlsx b/Automation/src/GSoC-Organizations-Data/results.xlsx new file mode 100644 index 00000000..29f99c46 Binary files /dev/null and b/Automation/src/GSoC-Organizations-Data/results.xlsx differ diff --git a/Automation/src/GSoC-Organizations-Data/scraper.py b/Automation/src/GSoC-Organizations-Data/scraper.py new file mode 100644 index 00000000..12437aaa --- /dev/null +++ b/Automation/src/GSoC-Organizations-Data/scraper.py @@ -0,0 +1,53 @@ +from selenium import webdriver +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from selenium.common.exceptions import TimeoutException +import time +import xlsxwriter + +browser = webdriver.Firefox() +url="https://summerofcode.withgoogle.com/organizations/?sp-page=5" +browser.get(url) + +delay = 5 + 
# Scrape every organization card on the already-loaded page into results.xlsx.
#
# Relies on `browser` (a started WebDriver that has navigated to the listing
# page) and `delay` (wait timeout in seconds) defined earlier in this script.
# One spreadsheet row is written per organization card: name, technologies,
# topic category and topic names.
try:
    # Block until at least one organization card is present in the DOM;
    # raises TimeoutException after `delay` seconds otherwise.
    WebDriverWait(browser, delay).until(
        EC.presence_of_element_located(
            (By.CLASS_NAME, 'organization-card__container')
        )
    )
    print("Page is ready!")

    workbook = xlsxwriter.Workbook('results.xlsx')
    try:
        worksheet = workbook.add_worksheet()
        col = 0
        bold = workbook.add_format({'bold': True})

        # Header row and column widths.
        worksheet.set_column(0, 2, 70)
        worksheet.set_column(3, 3, 150)
        worksheet.write(0, 0, 'ORGANISATION NAME', bold)
        worksheet.write(0, 1, 'TECHNOLOGIES', bold)
        worksheet.write(0, 2, 'TOPIC CATEGORY', bold)
        worksheet.write(0, 3, 'TOPIC NAMES', bold)

        orgs = browser.find_elements_by_class_name('organization-card__container')
        # Row 0 holds the headers, so data rows start at 1.
        for row, org in enumerate(orgs, start=1):
            org.click()
            # Wait for the topic tags to be present after the click before
            # reading any of the organization's details.
            WebDriverWait(browser, delay).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, 'organization__tag--topic')
                )
            )

            # NOTE(review): this looks up the first matching title in the
            # whole document, not inside `org`; confirm it resolves to the
            # clicked organization and not always the first card on the page.
            org_name = browser.find_element_by_class_name('organization-card__title').text
            worksheet.write(row, col, org_name)

            # Each tag is followed by a comma (including the last one), which
            # keeps the cell format identical to previously generated files.
            tech_tags = browser.find_elements_by_class_name('organization__tag--technology')
            tags_text = ''.join(tag.text + ',' for tag in tech_tags)
            worksheet.write(row, col + 1, tags_text)

            topic_cat = browser.find_element_by_class_name('organization__tag--category').text
            worksheet.write(row, col + 2, topic_cat)

            topics = browser.find_elements_by_class_name('organization__tag--topic')
            topics_text = ''.join(topic.text + ',' for topic in topics)
            worksheet.write(row, col + 3, topics_text)
    finally:
        # XlsxWriter writes the file to disk on close(); closing here keeps
        # the rows gathered so far even if a later wait times out.
        workbook.close()

except TimeoutException:
    print("Loading took too much time!")
finally:
    # Always shut down the Firefox session so no browser process is leaked.
    browser.quit()