"""Automate the Zotero Connector browser extension to save a URL into Zotero.

The script downloads and unpacks the Zotero Connector Chrome extension on
first use, launches Chromium with that extension loaded in a throwaway
profile directory, navigates to the requested URL, and triggers a save by
evaluating ``SAVE_SCRIPT`` inside the extension's service worker.
"""

import argparse
import asyncio
import io
import json
import os
import tempfile
import urllib.request
import zipfile
from datetime import datetime

try:
    from patchright.async_api import async_playwright
except ImportError:  # Reported with a clear message when a browser is needed.
    async_playwright = None

EXTENSION_ID = "ekhagklcjbdpajgpjgmbionohlpdbjgc"
# CRX download endpoint of the Chrome Web Store for EXTENSION_ID.
EXTENSION_URL = (
    "https://clients2.google.com/service/update2/crx"
    "?response=redirect&os=mac&arch=x86-64&os_arch=x86-64&nacl_arch=x86-64"
    "&prod=chromecrx&prodchannel=&prodversion=114.0.5735.90&lang=en-US"
    f"&acceptformat=crx3&x=id%3D{EXTENSION_ID}%26installsource%3Dondemand%26uc"
)
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
EXTENSION_DIR = os.path.join(BASE_DIR, "zotero_extension")

# Upper bound for the CRX download so a stalled connection cannot hang the run.
DOWNLOAD_TIMEOUT_SECONDS = 60

WINDOW_SIZE = {"width": 1280, "height": 800}
TRANSLATOR_WAIT_SECONDS = 3
WELCOME_TAB_WAIT_SECONDS = 2
POST_SAVE_WAIT_SECONDS = 5
SERVICE_WORKER_POLL_ATTEMPTS = 60
SERVICE_WORKER_POLL_INTERVAL = 0.5
# Zero-based poll attempts at which the list of known workers is logged.
SERVICE_WORKER_DEBUG_ATTEMPTS = {0, 5, 10, 20, 40, 59}

# JavaScript evaluated inside the extension's service worker. It receives
# ``{ libraryName, collectionId }``, optionally resolves the matching save
# target from the connector's "getSelectedCollection" response, temporarily
# wraps the connector's save calls so each save session is re-targeted through
# "updateSession", saves the active tab (translator index 0 when translators
# are available, otherwise as a webpage snapshot), restores the wrapped
# methods, and returns an object carrying either ``ok``/``mode``/``result`` or
# ``error``, plus a ``debug`` trail of ``{label, value}`` entries.
SAVE_SCRIPT = r"""
async ({ libraryName, collectionId }) => {
  const debug = [];
  const pushDebug = (label, value = null) => debug.push({ label, value });

  function normalizeCollectionId(value) {
    if (!value) return null;
    const trimmed = value.trim();
    if (/^[cC]\d+$/.test(trimmed)) return `C${trimmed.slice(1)}`;
    if (/^\d+$/.test(trimmed)) return `C${trimmed}`;
    return trimmed;
  }

  function summarizeTargets(targets) {
    return targets.map((target) => ({
      id: target.id,
      name: target.name,
      level: target.level
    }));
  }

  function summarizeTranslators(translators = []) {
    return translators.map((translator) => ({
      translatorID: translator.translatorID,
      label: translator.label,
      itemType: translator.itemType,
      priority: translator.priority
    }));
  }

  function findOwningLibrary(targets, targetId) {
    const index = targets.findIndex((target) => target.id === targetId);
    if (index === -1) return null;

    let level = targets[index].level || 0;
    for (let i = index - 1; i >= 0; i -= 1) {
      const candidate = targets[i];
      const candidateLevel = candidate.level || 0;
      if (candidateLevel >= level) continue;
      if (candidate.id.startsWith("L")) return candidate;
      level = candidateLevel;
    }

    return targets[index].id.startsWith("L") ? targets[index] : null;
  }

  async function resolveTarget() {
    if (!libraryName && !collectionId) {
      pushDebug("resolved target", null);
      return null;
    }

    const response = await Zotero.Connector.callMethod("getSelectedCollection", {
      switchToReadableLibrary: true
    });
    const targets = response.targets || [];
    pushDebug("available targets", summarizeTargets(targets));

    if (!targets.length) {
      throw new Error("Zotero did not return any selectable targets.");
    }

    let libraryTarget = null;
    if (libraryName) {
      const normalizedLibraryName = libraryName.trim().toLowerCase();
      const matches = targets.filter(
        (target) =>
          target.id.startsWith("L") &&
          target.name.trim().toLowerCase() === normalizedLibraryName
      );
      if (!matches.length) {
        throw new Error(`Library '${libraryName}' was not found.`);
      }
      if (matches.length > 1) {
        throw new Error(`Library '${libraryName}' is ambiguous.`);
      }
      libraryTarget = matches[0];
    }

    let collectionTarget = null;
    if (collectionId) {
      const normalizedCollectionId = normalizeCollectionId(collectionId);
      collectionTarget = targets.find((target) => target.id === normalizedCollectionId);
      if (!collectionTarget) {
        throw new Error(`Collection '${collectionId}' was not found.`);
      }
      if (libraryTarget) {
        const owningLibrary = findOwningLibrary(targets, collectionTarget.id);
        if (!owningLibrary || owningLibrary.id !== libraryTarget.id) {
          throw new Error(
            `Collection '${collectionId}' does not belong to library '${libraryName}'.`
          );
        }
      }
    }

    const target = collectionTarget || libraryTarget;
    pushDebug("resolved target", target ? { id: target.id, name: target.name } : null);
    return target;
  }

  function summarizeTabInfo(tabInfo) {
    if (!tabInfo) return null;
    return {
      url: tabInfo.url,
      isPDF: Boolean(tabInfo.isPDF),
      frameId: tabInfo.frameId,
      translatorCount: tabInfo.translators?.length || 0,
      translators: summarizeTranslators(tabInfo.translators)
    };
  }

  function installSessionHooks(target) {
    const originalCallMethodWithCookies =
      Zotero.Connector.callMethodWithCookies.bind(Zotero.Connector);
    const originalSaveStandaloneAttachment =
      Zotero.ItemSaver?.saveStandaloneAttachmentToZotero?.bind(Zotero.ItemSaver);

    async function applyTargetToSession(sessionID) {
      if (!target || !sessionID) return;
      pushDebug("apply target to session", { sessionID, targetId: target.id });
      await Zotero.Connector.callMethod("updateSession", {
        sessionID,
        target: target.id
      });
    }

    Zotero.Connector.callMethodWithCookies = async function(method, payload, ...args) {
      pushDebug("callMethodWithCookies request", {
        method,
        hasPayload: Boolean(payload),
        sessionID: payload?.sessionID || null
      });
      const result = await originalCallMethodWithCookies(method, payload, ...args);
      pushDebug("callMethodWithCookies response", { method, result });
      if ((method === "saveItems" || method === "saveSnapshot") && payload?.sessionID) {
        await applyTargetToSession(payload.sessionID);
      }
      return result;
    };

    if (originalSaveStandaloneAttachment) {
      Zotero.ItemSaver.saveStandaloneAttachmentToZotero = async function(
        attachment,
        sessionID,
        ...args
      ) {
        pushDebug("saveStandaloneAttachmentToZotero request", {
          title: attachment?.title || null,
          url: attachment?.url || null,
          sessionID
        });
        const result = await originalSaveStandaloneAttachment(
          attachment,
          sessionID,
          ...args
        );
        pushDebug("saveStandaloneAttachmentToZotero response", result);
        await applyTargetToSession(sessionID);
        return result;
      };
    }

    return () => {
      Zotero.Connector.callMethodWithCookies = originalCallMethodWithCookies;
      if (originalSaveStandaloneAttachment) {
        Zotero.ItemSaver.saveStandaloneAttachmentToZotero =
          originalSaveStandaloneAttachment;
      }
    };
  }

  async function runSave() {
    pushDebug("connector online", await Zotero.Connector.checkIsOnline());

    const tabs = await chrome.tabs.query({ active: true, currentWindow: true });
    if (!tabs?.length) {
      return { error: "No active tab found.", debug };
    }

    const tab = tabs[0];
    pushDebug("active tab", {
      id: tab.id,
      url: tab.url,
      title: tab.title,
      status: tab.status
    });

    const tabInfo = Zotero.Connector_Browser.getTabInfo(tab.id);
    pushDebug("tab info", summarizeTabInfo(tabInfo));

    if (!tabInfo) {
      return { error: "No translator or webpage saving options available.", debug };
    }

    if (tabInfo.translators?.length) {
      const result = await Zotero.Connector_Browser.saveWithTranslator(tab, 0, {
        fallbackOnFailure: true
      });
      pushDebug("saveWithTranslator result", result);
      return { ok: true, mode: "translator", result, debug };
    }

    const result = await Zotero.Connector_Browser.saveAsWebpage(tab, tabInfo.frameId, {
      snapshot: true
    });
    pushDebug("saveAsWebpage result", result);
    return { ok: true, mode: "webpage", result, debug };
  }

  try {
    const target = await resolveTarget();
    const restoreHooks = installSessionHooks(target);
    try {
      const result = await runSave();
      return { ...result, target };
    } finally {
      restoreHooks();
    }
  } catch (error) {
    pushDebug("caught error", {
      message: error.message,
      stack: error.stack
    });
    return { error: error.message, debug };
  }
}
"""


def debug_log(label, value=None):
    """Print a timestamped debug line, pretty-printing containers as JSON.

    Args:
        label: Short description printed after the ``[debug HH:MM:SS]`` prefix.
        value: Optional payload. ``None`` prints the label alone; dicts, lists
            and tuples are rendered with ``json.dumps`` and fall back to
            ``repr`` when they cannot be serialised.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    if value is None:
        print(f"[debug {timestamp}] {label}")
        return

    if isinstance(value, (dict, list, tuple)):
        try:
            value = json.dumps(value, ensure_ascii=True, default=str, indent=2)
        except (TypeError, ValueError):
            # ValueError covers circular references, which ``default=str``
            # cannot rescue; a debug helper must never abort the run.
            value = repr(value)

    print(f"[debug {timestamp}] {label}: {value}")


def setup_extension():
    """Download and unpack the Zotero Connector extension if needed.

    Returns:
        The absolute path of the unpacked extension directory.

    Raises:
        ValueError: If no ZIP local-file header is found in the download.
        urllib.error.URLError: If the download fails or times out.
    """
    manifest_path = os.path.join(EXTENSION_DIR, "manifest.json")
    if os.path.exists(manifest_path):
        print("[*] Zotero Extension already unpacked locally.")
        return os.path.abspath(EXTENSION_DIR)

    print("[*] Downloading Zotero Connector")
    request = urllib.request.Request(
        EXTENSION_URL,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"},
    )
    # Without a timeout a stalled connection would block forever.
    with urllib.request.urlopen(request, timeout=DOWNLOAD_TIMEOUT_SECONDS) as response:
        data = response.read()

    print("[*] Unpacking CRX file...")
    # A .crx file is a ZIP archive preceded by an extra header; skip to the
    # first ZIP local-file signature (PK\x03\x04).
    zip_start = data.find(b"PK\x03\x04")
    if zip_start == -1:
        raise ValueError("Could not find ZIP header in downloaded CRX.")

    os.makedirs(EXTENSION_DIR, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(data[zip_start:])) as archive:
        archive.extractall(EXTENSION_DIR)

    print("[*] Zotero Extension setup complete.")
    return os.path.abspath(EXTENSION_DIR)


def get_browser_launch_config(extension_path, headless_mode):
    """Build the launch settings that load the unpacked extension.

    Args:
        extension_path: Path of the unpacked extension directory.
        headless_mode: ``"new"`` adds ``--headless=new``, ``"false"`` runs
            headed, and any other value requests plain headless mode.

    Returns:
        A ``(playwright_headless, args)`` tuple: the ``headless`` flag for the
        launcher and the list of Chromium command-line arguments.
    """
    args = [
        f"--disable-extensions-except={extension_path}",
        f"--load-extension={extension_path}",
    ]

    if headless_mode == "new":
        # Headless is requested through the Chromium flag, so the launcher's
        # own headless switch stays off.
        args.append("--headless=new")
        playwright_headless = False
    elif headless_mode == "false":
        playwright_headless = False
    else:
        # NOTE(review): the pre-refactor code warned that plain headless mode
        # may ignore extensions depending on the Chromium version.
        playwright_headless = True

    return playwright_headless, args


async def close_extra_pages(browser_context):
    """Close every page except the first after a short settling delay.

    The delay gives the extension time to open any first-run tab before the
    extra pages are collected and closed.
    """
    await asyncio.sleep(WELCOME_TAB_WAIT_SECONDS)
    extra_pages = browser_context.pages[1:]
    if not extra_pages:
        return

    debug_log("closing extra tabs", [page.url for page in extra_pages])
    for extra_page in extra_pages:
        await extra_page.close()


async def get_primary_page(browser_context):
    """Return the page to navigate with, creating one if none exists."""
    await close_extra_pages(browser_context)
    page = browser_context.pages[0] if browser_context.pages else await browser_context.new_page()
    debug_log(
        "active page before navigation",
        {"url": page.url, "page_count": len(browser_context.pages)},
    )
    return page


async def wait_for_service_worker(browser_context):
    """Poll the context for the Zotero Connector service worker.

    Returns:
        The first service worker whose URL contains ``background-worker.js``
        or ``zotero``, or ``None`` if none appears within
        ``SERVICE_WORKER_POLL_ATTEMPTS`` polls.
    """
    print("[*] Finding Zotero Connector service worker...")
    for attempt in range(SERVICE_WORKER_POLL_ATTEMPTS):
        workers = list(browser_context.service_workers)
        if attempt in SERVICE_WORKER_DEBUG_ATTEMPTS:
            debug_log(
                "service worker poll",
                {
                    "attempt": attempt + 1,
                    "known_workers": [worker.url for worker in workers],
                },
            )

        for worker in workers:
            if "background-worker.js" in worker.url or "zotero" in worker.url:
                debug_log("selected service worker", worker.url)
                return worker

        await asyncio.sleep(SERVICE_WORKER_POLL_INTERVAL)

    return None


async def navigate_to_page(page, url, browser_context):
    """Load ``url`` in ``page`` and wait for the translator to settle.

    Logs the navigation response, sleeps ``TRANSLATOR_WAIT_SECONDS`` so the
    connector can inspect the loaded page, then logs a post-load snapshot.
    """
    print(f"[*] Navigating to {url}...")
    response = await page.goto(url, wait_until="load")
    debug_log(
        "navigation result",
        {
            "response_url": response.url if response else None,
            "status": response.status if response else None,
            "final_page_url": page.url,
            "title": await page.title(),
        },
    )

    print("[*] Page loaded. Waiting for Zotero translator to initialize...")
    await asyncio.sleep(TRANSLATOR_WAIT_SECONDS)
    debug_log(
        "post-load page snapshot",
        {
            "url": page.url,
            "title": await page.title(),
            "service_workers": [worker.url for worker in browser_context.service_workers],
        },
    )


def log_save_result(save_result):
    """Report the outcome returned by ``SAVE_SCRIPT``.

    Args:
        save_result: The mapping returned by the service-worker evaluation,
            or a falsy value when the evaluation produced nothing.
    """
    if not save_result or "error" in save_result:
        error = save_result.get("error") if save_result else "Unknown error"
        print(f"[!] Save trigger failed: {error}")
    else:
        print(f"[*] Save completed successfully via {save_result.get('mode', 'unknown')}.")
        target = save_result.get("target")
        if target:
            print(f"[*] Save target: {target.get('name')} ({target.get('id')})")
        if save_result.get("result") is not None:
            print(f"[*] Save returned: {save_result['result']}")

    debug_log("save_result", save_result)
    # A falsy result (e.g. None) has no debug trail; guarding here keeps the
    # failure report above from being followed by an AttributeError.
    debug_entries = save_result.get("debug", []) if save_result else []
    for index, entry in enumerate(debug_entries, start=1):
        debug_log(f"worker debug #{index} {entry.get('label')}", entry.get("value"))


async def save_to_zotero(url, headless_mode="new", library_name=None, collection_id=None):
    """Open ``url`` in Chromium and trigger a Zotero Connector save.

    Args:
        url: Page to save.
        headless_mode: Passed to ``get_browser_launch_config``.
        library_name: Optional library name forwarded to ``SAVE_SCRIPT``.
        collection_id: Optional collection ID forwarded to ``SAVE_SCRIPT``.

    Raises:
        RuntimeError: If the ``patchright`` package is not installed.
    """
    if async_playwright is None:
        raise RuntimeError(
            "The 'patchright' package is required to launch the browser."
        )

    extension_path = setup_extension()
    playwright_headless, browser_args = get_browser_launch_config(extension_path, headless_mode)

    print(f"[*] Launching Chromium browser (headless={headless_mode}) with Zotero Connector...")
    debug_log(
        "launch configuration",
        {"playwright_headless": playwright_headless, "args": browser_args},
    )

    async with async_playwright() as playwright:
        # A fresh temporary profile is used for every run and removed afterwards.
        with tempfile.TemporaryDirectory(prefix="zotero-paper-fetcher-") as user_data_dir:
            debug_log(
                "save_to_zotero arguments",
                {
                    "url": url,
                    "headless_mode": headless_mode,
                    "library_name": library_name,
                    "collection_id": collection_id,
                    "extension_path": extension_path,
                    "user_data_dir": user_data_dir,
                },
            )
            browser_context = await playwright.chromium.launch_persistent_context(
                user_data_dir,
                headless=playwright_headless,
                args=browser_args,
                viewport=WINDOW_SIZE,
            )

            try:
                debug_log(
                    "temporary context launched",
                    {
                        "initial_page_count": len(browser_context.pages),
                        "service_worker_count": len(browser_context.service_workers),
                        "user_data_dir": user_data_dir,
                    },
                )
                page = await get_primary_page(browser_context)
                await navigate_to_page(page, url, browser_context)

                worker = await wait_for_service_worker(browser_context)
                if not worker:
                    print("[!] Could not find Zotero extension service worker.")
                    return

                print("[*] Triggering save via extension service worker evaluation...")
                save_result = await worker.evaluate(
                    SAVE_SCRIPT,
                    {"libraryName": library_name, "collectionId": collection_id},
                )
                log_save_result(save_result)

                # Message derives from the constant so the two cannot drift apart.
                print(
                    f"[*] Waiting {POST_SAVE_WAIT_SECONDS} seconds "
                    "for any delayed connector activity..."
                )
                await asyncio.sleep(POST_SAVE_WAIT_SECONDS)
            finally:
                # Runs on success, early return, and errors alike.
                print("[*] Operation finished. Closing browser.")
                await browser_context.close()


def parse_args():
    """Parse the command-line arguments of the script."""
    parser = argparse.ArgumentParser(description="Automate Zotero Connector via Playwright.")
    parser.add_argument(
        "url",
        nargs="?",
        default="https://arxiv.org/abs/1706.03762",
        help="URL to save to Zotero",
    )
    parser.add_argument(
        "--headed",
        action="store_true",
        help="Show browser UI visually instead of headless=new",
    )
    parser.add_argument("--library-name", help="Save into the library with this exact name")
    parser.add_argument(
        "--collection-id",
        help="Save into the collection with this ID, such as 13 or C13",
    )
    return parser.parse_args()


def main():
    """Script entry point: parse arguments and run the save workflow."""
    print(EXTENSION_DIR)
    args = parse_args()
    headless_mode = "false" if args.headed else "new"
    asyncio.run(
        save_to_zotero(
            args.url,
            headless_mode=headless_mode,
            library_name=args.library_name,
            collection_id=args.collection_id,
        )
    )


if __name__ == "__main__":
    main()