"""Download large images/videos from 9gag's fresh 'funny' tag.

Loads the page with Selenium, scrolls to trigger lazy loading of posts,
then saves every image/video taller than MIN_HEIGHT px (skipping .webm)
into DOWNLOAD_DIR. All activity is logged to download.log in the current
working directory.
"""

import logging
import os
import time
import urllib.request
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By

# Page to scrape and where downloaded media ends up.
URL = "https://9gag.com/tag/funny/fresh"
DOWNLOAD_DIR = "z:\\gag"
NUM_SCROLLS = 3    # scroll iterations used to trigger lazy loading
MIN_HEIGHT = 320   # minimum rendered height in px for a post to qualify


def main():
    """Run the scrape-and-download workflow end to end."""
    logging.basicConfig(
        filename=os.path.join(os.getcwd(), 'download.log'),
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    driver = None
    try:
        # BUG FIX: the target directory was never created, so every
        # urlretrieve() call would fail on a fresh machine.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)

        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')
        # options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)

        driver.get(URL)
        time.sleep(5)  # allow the initial page load to settle

        # Scroll several times so lazily-loaded posts appear.
        for _ in range(NUM_SCROLLS):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
        time.sleep(5)  # let the last batch of media finish loading

        # Images taller than MIN_HEIGHT whose inline style carries the height;
        # .webm sources are excluded here and re-checked per post below.
        images = driver.find_elements(
            By.XPATH,
            "//img[not(contains(@src, '.webm')) and contains(@style, 'height:') "
            "and substring-before(substring-after(@style, 'height:'), 'px') > 320]",
        )
        # BUG FIX: the original passed XPath syntax to By.CSS_SELECTOR, which
        # raises InvalidSelectorException; express the same filter as XPath.
        videos = driver.find_elements(
            By.XPATH,
            "//video[not(contains(@src, '.webm')) and @height > 320]",
        )
        posts = images + videos
        if not posts:
            logging.warning("No suitable posts found")

        # Files already saved; names are <timestamp>_<stem>.<ext>, so dedupe
        # on the trailing "_<stem>.<ext>" part.
        existing = os.listdir(DOWNLOAD_DIR)

        for post in posts:
            # BUG FIX: get_attribute() returns a string (or None); the original
            # chained .get("url") onto it, which would raise AttributeError.
            post_url = post.get_attribute("src")
            if not post_url:
                continue

            extension = post_url.split('.')[-1]
            if extension == "webm":
                continue  # skip video files in webm format

            # BUG FIX: the original tested os.path.isfile(post_url), which is
            # never true for an https:// URL; dedupe on the post's basename.
            stem = post_url.split('/')[-1].split('.')[0]
            if any(name.endswith(f"_{stem}.{extension}") for name in existing):
                logging.warning(f"Post already downloaded: {post_url}")
                continue

            print(f"Downloading {post_url}")
            filename = f"{datetime.now().strftime('%y%m%d%H%M%S')}_{stem}.{extension}"
            filepath = os.path.join(DOWNLOAD_DIR, filename)
            urllib.request.urlretrieve(post_url, filepath)  # blocks until done
            existing.append(filename)
            logging.info(f"Downloaded post: {filepath}")

        # NOTE: the original "wait for downloads to complete" polling loop was
        # removed: urlretrieve() is synchronous, and the loop compared a
        # directory listing against itself, so it always exited immediately.

        logging.info("Script finished")
    except Exception:
        # BUG FIX: logging.exception records the full traceback instead of
        # just str(e), which the original discarded.
        logging.exception("An error occurred")
    finally:
        # BUG FIX: the driver was never quit (the call was commented out),
        # leaking a Chrome process on every run.
        if driver is not None:
            driver.quit()


if __name__ == "__main__":
    main()