use patchright and temporary profile to fix issues, added debug output

This commit is contained in:
Mantao Huang 2026-03-12 10:58:14 -04:00
parent dd5c159108
commit 11cf99e63a

View File

@ -1,150 +1,102 @@
import argparse
import asyncio import asyncio
from playwright.async_api import async_playwright import io
from playwright_stealth import Stealth import json
import os import os
import tempfile
import urllib.request import urllib.request
import zipfile import zipfile
import io from datetime import datetime
import sys
import argparse
EXTENSION_ID = 'ekhagklcjbdpajgpjgmbionohlpdbjgc' from patchright.async_api import async_playwright
# Chrome Web Store ID of the Zotero Connector extension.
EXTENSION_ID = "ekhagklcjbdpajgpjgmbionohlpdbjgc"

# Fetch the packed extension (CRX3) directly from the Chrome Web Store
# update endpoint.
EXTENSION_URL = (
    "https://clients2.google.com/service/update2/crx"
    "?response=redirect&os=mac&arch=x86-64&os_arch=x86-64&nacl_arch=x86-64"
    "&prod=chromecrx&prodchannel=&prodversion=114.0.5735.90&lang=en-US"
    f"&acceptformat=crx3&x=id%3D{EXTENSION_ID}%26installsource%3Dondemand%26uc"
)

# The unpacked extension lives next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
EXTENSION_DIR = os.path.join(BASE_DIR, "zotero_extension")

# Viewport passed to the browser context.
WINDOW_SIZE = {"width": 1280, "height": 800}

# Fixed pauses (seconds) used while driving the browser.
TRANSLATOR_WAIT_SECONDS = 3
WELCOME_TAB_WAIT_SECONDS = 2
POST_SAVE_WAIT_SECONDS = 5

# Service-worker discovery: number of polls and the delay between them.
SERVICE_WORKER_POLL_ATTEMPTS = 60
SERVICE_WORKER_POLL_INTERVAL = 0.5
# Zero-based attempts on which the poll state is logged; the last entry is
# tied to the attempt count so the final poll is always reported.
SERVICE_WORKER_DEBUG_ATTEMPTS = {0, 5, 10, 20, 40, SERVICE_WORKER_POLL_ATTEMPTS - 1}
print("[*] Downloading Zotero Connector") SAVE_SCRIPT = r"""
req = urllib.request.Request( async ({ libraryName, collectionId }) => {
EXTENSION_URL, const debug = [];
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} const pushDebug = (label, value = null) => debug.push({ label, value });
)
with urllib.request.urlopen(req) as response:
data = response.read()
print("[*] Unpacking CRX file...")
# .crx files are zip files with an extra header. Find the standard ZIP header (PK\x03\x04).
zip_start = data.find(b'PK\x03\x04')
if zip_start == -1:
raise ValueError("Could not find ZIP header in downloaded CRX.")
os.makedirs(EXTENSION_DIR, exist_ok=True)
with zipfile.ZipFile(io.BytesIO(data[zip_start:])) as z:
z.extractall(EXTENSION_DIR)
print("[*] Zotero Extension setup complete.")
return os.path.abspath(EXTENSION_DIR)
async def save_to_zotero(url, headless_mode="new", library_name=None, collection_id=None):
"""Automates Chrome to load a URL and trigger Zotero Connector."""
extension_path = setup_extension()
print(f"[*] Launching Chromium browser (headless={headless_mode}) with Zotero Connector...")
# Prepare playwright arguments
args = [
f"--disable-extensions-except={extension_path}",
f"--load-extension={extension_path}",
]
# Standard headless=True historically blocked extensions.
# We use headless=False by default or can pass `--headless=new` parameter.
if headless_mode == "new":
args.append("--headless=new")
playwright_headless = False
elif headless_mode == "false":
playwright_headless = False
else:
playwright_headless = True # May ignore extensions depending on Chromium version
async with Stealth().use_async(async_playwright()) as p:
browser_context = await p.chromium.launch_persistent_context(
USER_DATA_DIR,
headless=playwright_headless,
args=args,
viewport={'width': 1280, 'height': 800}
)
# Close any welcome tabs the extension might open on first run
await asyncio.sleep(2)
if len(browser_context.pages) > 1:
for p_ext in browser_context.pages[1:]:
await p_ext.close()
# Use the primary tab
page = browser_context.pages[0] if browser_context.pages else await browser_context.new_page()
print(f"[*] Navigating to {url}...")
await page.goto(url, wait_until="load")
print("[*] Page loaded. Waiting for Zotero translator to initialize...")
# Give Zotero connection translator a moment to inject and detect the metadata
await asyncio.sleep(3)
print("[*] Finding Zotero Connector service worker...")
worker = None
for i in range(60):
for w in browser_context.service_workers:
if "background-worker.js" in w.url or "zotero" in w.url:
worker = w
break
if worker:
break
await asyncio.sleep(0.5)
if not worker:
print("[!] Could not find Zotero extension service worker.")
else:
assert worker is not None
print("[*] Triggering save via extension service worker evaluation...")
save_result = await worker.evaluate('''async ({ libraryName, collectionId }) => {
function normalizeCollectionId(value) { function normalizeCollectionId(value) {
if (!value) return null; if (!value) return null;
const trimmed = value.trim(); const trimmed = value.trim();
if (/^[cC]\\d+$/.test(trimmed)) return `C${trimmed.slice(1)}`; if (/^[cC]\d+$/.test(trimmed)) return `C${trimmed.slice(1)}`;
if (/^\\d+$/.test(trimmed)) return `C${trimmed}`; if (/^\d+$/.test(trimmed)) return `C${trimmed}`;
return trimmed; return trimmed;
} }
function findOwningLibrary(targets, targetId) { function summarizeTargets(targets) {
let index = targets.findIndex((target) => target.id === targetId); return targets.map((target) => ({
if (index === -1) return null; id: target.id,
let currentLevel = targets[index].level || 0; name: target.name,
level: target.level
}));
}
for (let i = index - 1; i >= 0; i--) { function summarizeTranslators(translators = []) {
let candidate = targets[i]; return translators.map((translator) => ({
let candidateLevel = candidate.level || 0; translatorID: translator.translatorID,
if (candidateLevel < currentLevel) { label: translator.label,
if (candidate.id.startsWith("L")) { itemType: translator.itemType,
return candidate; priority: translator.priority
} }));
currentLevel = candidateLevel;
} }
function findOwningLibrary(targets, targetId) {
const index = targets.findIndex((target) => target.id === targetId);
if (index === -1) return null;
let level = targets[index].level || 0;
for (let i = index - 1; i >= 0; i -= 1) {
const candidate = targets[i];
const candidateLevel = candidate.level || 0;
if (candidateLevel >= level) continue;
if (candidate.id.startsWith("L")) return candidate;
level = candidateLevel;
} }
return targets[index].id.startsWith("L") ? targets[index] : null; return targets[index].id.startsWith("L") ? targets[index] : null;
} }
async function resolveTarget() { async function resolveTarget() {
if (!libraryName && !collectionId) return null; if (!libraryName && !collectionId) {
pushDebug("resolved target", null);
return null;
}
const response = await Zotero.Connector.callMethod("getSelectedCollection", {
switchToReadableLibrary: true
});
const targets = response.targets || [];
pushDebug("available targets", summarizeTargets(targets));
let response = await Zotero.Connector.callMethod("getSelectedCollection", { switchToReadableLibrary: true });
let targets = response.targets || [];
if (!targets.length) { if (!targets.length) {
throw new Error("Zotero did not return any selectable targets."); throw new Error("Zotero did not return any selectable targets.");
} }
let libraryTarget = null; let libraryTarget = null;
if (libraryName) { if (libraryName) {
let normalizedLibraryName = libraryName.trim().toLowerCase(); const normalizedLibraryName = libraryName.trim().toLowerCase();
let matches = targets.filter((target) => const matches = targets.filter(
target.id.startsWith("L") && target.name.trim().toLowerCase() === normalizedLibraryName (target) =>
target.id.startsWith("L") &&
target.name.trim().toLowerCase() === normalizedLibraryName
); );
if (!matches.length) { if (!matches.length) {
throw new Error(`Library '${libraryName}' was not found.`); throw new Error(`Library '${libraryName}' was not found.`);
@ -157,13 +109,13 @@ async def save_to_zotero(url, headless_mode="new", library_name=None, collection
let collectionTarget = null; let collectionTarget = null;
if (collectionId) { if (collectionId) {
let normalizedCollectionId = normalizeCollectionId(collectionId); const normalizedCollectionId = normalizeCollectionId(collectionId);
collectionTarget = targets.find((target) => target.id === normalizedCollectionId); collectionTarget = targets.find((target) => target.id === normalizedCollectionId);
if (!collectionTarget) { if (!collectionTarget) {
throw new Error(`Collection '${collectionId}' was not found.`); throw new Error(`Collection '${collectionId}' was not found.`);
} }
if (libraryTarget) { if (libraryTarget) {
let owningLibrary = findOwningLibrary(targets, collectionTarget.id); const owningLibrary = findOwningLibrary(targets, collectionTarget.id);
if (!owningLibrary || owningLibrary.id !== libraryTarget.id) { if (!owningLibrary || owningLibrary.id !== libraryTarget.id) {
throw new Error( throw new Error(
`Collection '${collectionId}' does not belong to library '${libraryName}'.` `Collection '${collectionId}' does not belong to library '${libraryName}'.`
@ -172,20 +124,45 @@ async def save_to_zotero(url, headless_mode="new", library_name=None, collection
} }
} }
return collectionTarget || libraryTarget; const target = collectionTarget || libraryTarget;
pushDebug("resolved target", target ? { id: target.id, name: target.name } : null);
return target;
} }
let target = await resolveTarget(); function summarizeTabInfo(tabInfo) {
let applyTargetToSession = async (sessionID) => { if (!tabInfo) return null;
if (!target || !sessionID) return; return {
await Zotero.Connector.callMethod("updateSession", { sessionID, target: target.id }); url: tabInfo.url,
isPDF: Boolean(tabInfo.isPDF),
frameId: tabInfo.frameId,
translatorCount: tabInfo.translators?.length || 0,
translators: summarizeTranslators(tabInfo.translators)
}; };
}
let originalCallMethodWithCookies = Zotero.Connector.callMethodWithCookies.bind(Zotero.Connector); function installSessionHooks(target) {
let originalSaveStandaloneAttachment = Zotero.ItemSaver?.saveStandaloneAttachmentToZotero?.bind(Zotero.ItemSaver); const originalCallMethodWithCookies =
Zotero.Connector.callMethodWithCookies.bind(Zotero.Connector);
const originalSaveStandaloneAttachment =
Zotero.ItemSaver?.saveStandaloneAttachmentToZotero?.bind(Zotero.ItemSaver);
async function applyTargetToSession(sessionID) {
if (!target || !sessionID) return;
pushDebug("apply target to session", { sessionID, targetId: target.id });
await Zotero.Connector.callMethod("updateSession", {
sessionID,
target: target.id
});
}
Zotero.Connector.callMethodWithCookies = async function(method, payload, ...args) { Zotero.Connector.callMethodWithCookies = async function(method, payload, ...args) {
let result = await originalCallMethodWithCookies(method, payload, ...args); pushDebug("callMethodWithCookies request", {
method,
hasPayload: Boolean(payload),
sessionID: payload?.sessionID || null
});
const result = await originalCallMethodWithCookies(method, payload, ...args);
pushDebug("callMethodWithCookies response", { method, result });
if ((method === "saveItems" || method === "saveSnapshot") && payload?.sessionID) { if ((method === "saveItems" || method === "saveSnapshot") && payload?.sessionID) {
await applyTargetToSession(payload.sessionID); await applyTargetToSession(payload.sessionID);
} }
@ -193,67 +170,335 @@ async def save_to_zotero(url, headless_mode="new", library_name=None, collection
}; };
if (originalSaveStandaloneAttachment) { if (originalSaveStandaloneAttachment) {
Zotero.ItemSaver.saveStandaloneAttachmentToZotero = async function(attachment, sessionID, ...args) { Zotero.ItemSaver.saveStandaloneAttachmentToZotero = async function(
let result = await originalSaveStandaloneAttachment(attachment, sessionID, ...args); attachment,
sessionID,
...args
) {
pushDebug("saveStandaloneAttachmentToZotero request", {
title: attachment?.title || null,
url: attachment?.url || null,
sessionID
});
const result = await originalSaveStandaloneAttachment(
attachment,
sessionID,
...args
);
pushDebug("saveStandaloneAttachmentToZotero response", result);
await applyTargetToSession(sessionID); await applyTargetToSession(sessionID);
return result; return result;
}; };
} }
try { return () => {
let tabs = await chrome.tabs.query({ active: true, currentWindow: true });
if (!tabs || tabs.length === 0) return {error: "No active tab found."};
let tab = tabs[0];
let tabInfo = Zotero.Connector_Browser.getTabInfo(tab.id);
if (tabInfo && tabInfo.translators && tabInfo.translators.length) {
let result = await Zotero.Connector_Browser.saveWithTranslator(tab, 0, {fallbackOnFailure: true});
return { ok: true, mode: "translator", result, target };
} else if (tabInfo) {
let result = await Zotero.Connector_Browser.saveAsWebpage(tab, tabInfo.frameId, { snapshot: true });
return { ok: true, mode: "webpage", result, target };
} else {
return {error: "No translator or webpage saving options available."};
}
} catch(e) {
return {error: e.message};
} finally {
Zotero.Connector.callMethodWithCookies = originalCallMethodWithCookies; Zotero.Connector.callMethodWithCookies = originalCallMethodWithCookies;
if (originalSaveStandaloneAttachment) { if (originalSaveStandaloneAttachment) {
Zotero.ItemSaver.saveStandaloneAttachmentToZotero = originalSaveStandaloneAttachment; Zotero.ItemSaver.saveStandaloneAttachmentToZotero =
originalSaveStandaloneAttachment;
} }
};
} }
}''', {"libraryName": library_name, "collectionId": collection_id})
if not save_result or "error" in save_result: async function runSave() {
print(f"[!] Save trigger failed: {save_result.get('error') if save_result else 'Unknown error'}") pushDebug("connector online", await Zotero.Connector.checkIsOnline());
const tabs = await chrome.tabs.query({ active: true, currentWindow: true });
if (!tabs?.length) {
return { error: "No active tab found.", debug };
}
const tab = tabs[0];
pushDebug("active tab", {
id: tab.id,
url: tab.url,
title: tab.title,
status: tab.status
});
const tabInfo = Zotero.Connector_Browser.getTabInfo(tab.id);
pushDebug("tab info", summarizeTabInfo(tabInfo));
if (!tabInfo) {
return { error: "No translator or webpage saving options available.", debug };
}
if (tabInfo.translators?.length) {
const result = await Zotero.Connector_Browser.saveWithTranslator(tab, 0, {
fallbackOnFailure: true
});
pushDebug("saveWithTranslator result", result);
return { ok: true, mode: "translator", result, debug };
}
const result = await Zotero.Connector_Browser.saveAsWebpage(tab, tabInfo.frameId, {
snapshot: true
});
pushDebug("saveAsWebpage result", result);
return { ok: true, mode: "webpage", result, debug };
}
try {
const target = await resolveTarget();
const restoreHooks = installSessionHooks(target);
try {
const result = await runSave();
return { ...result, target };
} finally {
restoreHooks();
}
} catch (error) {
pushDebug("caught error", {
message: error.message,
stack: error.stack
});
return { error: error.message, debug };
}
}
"""
def debug_log(label, value=None):
    """Print a timestamped debug line to stdout.

    Args:
        label: Short description of what is being logged.
        value: Optional payload. ``None`` prints the label alone. Dicts,
            lists and tuples are pretty-printed as JSON, falling back to
            ``repr()`` when they cannot be serialized. Any other value is
            interpolated as-is.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    prefix = f"[debug {timestamp}] {label}"
    if value is None:
        print(prefix)
        return

    if isinstance(value, (dict, list, tuple)):
        try:
            value = json.dumps(value, ensure_ascii=True, default=str, indent=2)
        except (TypeError, ValueError):
            # TypeError: unsupported dict keys. ValueError: circular
            # references. A debug helper must never abort the caller.
            value = repr(value)

    print(f"{prefix}: {value}")
def setup_extension():
    """Make sure the unpacked Zotero Connector is on disk and return its path.

    If ``manifest.json`` already exists under ``EXTENSION_DIR`` nothing is
    downloaded. Otherwise the CRX is fetched from ``EXTENSION_URL`` and the
    ZIP payload embedded in it is extracted into ``EXTENSION_DIR``.

    Returns:
        Absolute path of the unpacked extension directory.

    Raises:
        ValueError: If the downloaded data contains no ZIP local-file header.
    """
    unpacked_path = os.path.abspath(EXTENSION_DIR)

    if os.path.exists(os.path.join(EXTENSION_DIR, "manifest.json")):
        print("[*] Zotero Extension already unpacked locally.")
        return unpacked_path

    print("[*] Downloading Zotero Connector")
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    crx_request = urllib.request.Request(EXTENSION_URL, headers=browser_headers)
    with urllib.request.urlopen(crx_request) as crx_response:
        crx_bytes = crx_response.read()

    print("[*] Unpacking CRX file...")
    # A CRX is a ZIP archive preceded by an extra header; locate the first
    # ZIP local-file signature and treat everything from there as the archive.
    payload_offset = crx_bytes.find(b"PK\x03\x04")
    if payload_offset == -1:
        raise ValueError("Could not find ZIP header in downloaded CRX.")

    os.makedirs(EXTENSION_DIR, exist_ok=True)
    zip_payload = io.BytesIO(crx_bytes[payload_offset:])
    with zipfile.ZipFile(zip_payload) as crx_archive:
        crx_archive.extractall(EXTENSION_DIR)

    print("[*] Zotero Extension setup complete.")
    return unpacked_path
def get_browser_launch_config(extension_path, headless_mode):
    """Build the headless flag and Chromium arguments for the launch call.

    Args:
        extension_path: Path of the unpacked extension to load.
        headless_mode: ``"new"`` adds ``--headless=new`` to the Chromium
            arguments and keeps the launcher's own headless flag off;
            ``"false"`` runs headed; any other value turns the launcher's
            headless flag on.

    Returns:
        Tuple ``(playwright_headless, args)``: the boolean for the
        launcher's ``headless`` option and the list of Chromium
        command-line arguments.
    """
    args = [
        f"--disable-extensions-except={extension_path}",
        f"--load-extension={extension_path}",
    ]

    if headless_mode == "new":
        # Headless is requested through the Chromium flag, so the
        # launcher's own headless switch stays off.
        args.append("--headless=new")
        playwright_headless = False
    elif headless_mode == "false":
        playwright_headless = False
    else:
        playwright_headless = True

    return playwright_headless, args
async def close_extra_pages(browser_context):
    """Close every page of the context except the first one.

    Sleeps for ``WELCOME_TAB_WAIT_SECONDS`` first, then closes all pages
    after index 0, logging their URLs beforehand.
    """
    await asyncio.sleep(WELCOME_TAB_WAIT_SECONDS)

    surplus_pages = browser_context.pages[1:]
    if surplus_pages:
        debug_log("closing extra tabs", [tab.url for tab in surplus_pages])
        for tab in surplus_pages:
            await tab.close()
async def get_primary_page(browser_context):
    """Return the page to drive, creating one when the context has none.

    Extra pages are closed first; the first remaining page is reused if it
    exists, otherwise a new page is opened.
    """
    await close_extra_pages(browser_context)

    open_pages = browser_context.pages
    if open_pages:
        page = open_pages[0]
    else:
        page = await browser_context.new_page()

    # Re-read the page list so the count includes a freshly created page.
    page_state = {"url": page.url, "page_count": len(browser_context.pages)}
    debug_log("active page before navigation", page_state)
    return page
async def wait_for_service_worker(browser_context):
    """Poll the context until the Zotero Connector service worker shows up.

    Checks ``browser_context.service_workers`` up to
    ``SERVICE_WORKER_POLL_ATTEMPTS`` times, sleeping
    ``SERVICE_WORKER_POLL_INTERVAL`` seconds after each miss.

    Returns:
        The first worker whose URL contains ``background-worker.js`` or
        ``zotero``, or ``None`` if none appeared in time.
    """
    print("[*] Finding Zotero Connector service worker...")

    for attempt in range(SERVICE_WORKER_POLL_ATTEMPTS):
        candidates = list(browser_context.service_workers)

        # Only a handful of attempts are logged to keep the output readable.
        if attempt in SERVICE_WORKER_DEBUG_ATTEMPTS:
            poll_state = {
                "attempt": attempt + 1,
                "known_workers": [candidate.url for candidate in candidates],
            }
            debug_log("service worker poll", poll_state)

        match = next(
            (
                candidate
                for candidate in candidates
                if "background-worker.js" in candidate.url or "zotero" in candidate.url
            ),
            None,
        )
        if match is not None:
            debug_log("selected service worker", match.url)
            return match

        await asyncio.sleep(SERVICE_WORKER_POLL_INTERVAL)

    return None
async def navigate_to_page(page, url, browser_context):
    """Open ``url`` in ``page`` and pause so the connector can inspect it.

    Logs the navigation response, sleeps for ``TRANSLATOR_WAIT_SECONDS``,
    then logs the page URL, title and the known service-worker URLs.
    """
    print(f"[*] Navigating to {url}...")
    response = await page.goto(url, wait_until="load")

    # ``goto`` may yield no response object, hence the None fallbacks.
    response_url = response.url if response else None
    response_status = response.status if response else None
    navigation_summary = {
        "response_url": response_url,
        "status": response_status,
        "final_page_url": page.url,
        "title": await page.title(),
    }
    debug_log("navigation result", navigation_summary)

    print("[*] Page loaded. Waiting for Zotero translator to initialize...")
    await asyncio.sleep(TRANSLATOR_WAIT_SECONDS)

    page_snapshot = {
        "url": page.url,
        "title": await page.title(),
        "service_workers": [sw.url for sw in browser_context.service_workers],
    }
    debug_log("post-load page snapshot", page_snapshot)
def log_save_result(save_result):
    """Report the outcome of the in-worker save script.

    Args:
        save_result: Mapping returned by the worker evaluation, or a falsy
            value when nothing came back. Keys read here: ``error``,
            ``mode``, ``target`` (with ``name``/``id``), ``result`` and
            ``debug`` (a list of ``{"label", "value"}`` entries).
    """
    if not save_result or "error" in save_result:
        error = save_result.get("error") if save_result else "Unknown error"
        print(f"[!] Save trigger failed: {error}")
    else:
        print(f"[*] Save completed successfully via {save_result.get('mode', 'unknown')}.")
        target = save_result.get("target")
        if target:
            print(f"[*] Save target: {target.get('name')} ({target.get('id')})")
        if save_result.get("result") is not None:
            print(f"[*] Save returned: {save_result['result']}")

    debug_log("save_result", save_result)

    # The failure branch above accepts a falsy result (e.g. None), so the
    # debug dump must tolerate it too instead of raising AttributeError.
    debug_entries = save_result.get("debug", []) if save_result else []
    for index, entry in enumerate(debug_entries, start=1):
        debug_log(f"worker debug #{index} {entry.get('label')}", entry.get("value"))
async def save_to_zotero(url, headless_mode="new", library_name=None, collection_id=None):
    """Load ``url`` in Chromium with the Zotero Connector and trigger a save.

    The browser runs against a throwaway profile directory that is removed
    when the function finishes. The save itself is performed by evaluating
    ``SAVE_SCRIPT`` inside the extension's service worker.

    Args:
        url: Page to open and save.
        headless_mode: Forwarded to ``get_browser_launch_config``.
        library_name: Optional library name handed to the save script.
        collection_id: Optional collection ID handed to the save script.

    Returns:
        None. Progress and results are printed.
    """
    extension_path = setup_extension()
    playwright_headless, browser_args = get_browser_launch_config(extension_path, headless_mode)

    print(f"[*] Launching Chromium browser (headless={headless_mode}) with Zotero Connector...")
    debug_log(
        "launch configuration",
        {"playwright_headless": playwright_headless, "args": browser_args},
    )

    async with async_playwright() as playwright:
        with tempfile.TemporaryDirectory(prefix="zotero-paper-fetcher-") as user_data_dir:
            debug_log(
                "save_to_zotero arguments",
                {
                    "url": url,
                    "headless_mode": headless_mode,
                    "library_name": library_name,
                    "collection_id": collection_id,
                    "extension_path": extension_path,
                    "user_data_dir": user_data_dir,
                },
            )

            browser_context = await playwright.chromium.launch_persistent_context(
                user_data_dir,
                headless=playwright_headless,
                args=browser_args,
                viewport=WINDOW_SIZE,
            )

            try:
                debug_log(
                    "temporary context launched",
                    {
                        "initial_page_count": len(browser_context.pages),
                        "service_worker_count": len(browser_context.service_workers),
                        "user_data_dir": user_data_dir,
                    },
                )

                page = await get_primary_page(browser_context)
                await navigate_to_page(page, url, browser_context)

                worker = await wait_for_service_worker(browser_context)
                if not worker:
                    print("[!] Could not find Zotero extension service worker.")
                    return

                print("[*] Triggering save via extension service worker evaluation...")
                save_result = await worker.evaluate(
                    SAVE_SCRIPT,
                    {"libraryName": library_name, "collectionId": collection_id},
                )
                log_save_result(save_result)

                # Report the configured delay rather than a hard-coded
                # number so the message cannot drift from the actual wait.
                print(
                    f"[*] Waiting {POST_SAVE_WAIT_SECONDS} seconds "
                    "for any delayed connector activity..."
                )
                await asyncio.sleep(POST_SAVE_WAIT_SECONDS)
            finally:
                # Runs on the early return above as well, so the browser is
                # always closed before the temporary profile is deleted.
                print("[*] Operation finished. Closing browser.")
                await browser_context.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Automate Zotero Connector via Playwright.")
parser.add_argument("url", nargs="?", default="https://arxiv.org/abs/1706.03762", help="URL to save to Zotero")
parser.add_argument("--headed", action="store_true", help="Show browser UI visually instead of headless=new")
parser.add_argument("--library-name", help="Save into the library with this exact name")
parser.add_argument("--collection-id", help="Save into the collection with this ID, such as 13 or C13")
args = parser.parse_args()
def parse_args(argv=None):
    """Parse the command-line options of the Zotero save automation.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, which is what callers
            passing no argument get.

    Returns:
        ``argparse.Namespace`` with ``url``, ``headed``, ``library_name``
        and ``collection_id`` attributes.
    """
    parser = argparse.ArgumentParser(description="Automate Zotero Connector via Playwright.")
    parser.add_argument(
        "url",
        nargs="?",
        default="https://arxiv.org/abs/1706.03762",
        help="URL to save to Zotero",
    )
    parser.add_argument(
        "--headed",
        action="store_true",
        help="Show browser UI visually instead of headless=new",
    )
    parser.add_argument("--library-name", help="Save into the library with this exact name")
    parser.add_argument(
        "--collection-id",
        help="Save into the collection with this ID, such as 13 or C13",
    )
    return parser.parse_args(argv)
def main():
    """Script entry point: parse the CLI options and run the save once."""
    print(EXTENSION_DIR)
    cli = parse_args()

    # --headed shows the browser window; otherwise Chromium's new headless
    # mode is requested.
    if cli.headed:
        headless_mode = "false"
    else:
        headless_mode = "new"

    save_job = save_to_zotero(
        cli.url,
        headless_mode=headless_mode,
        library_name=cli.library_name,
        collection_id=cli.collection_id,
    )
    asyncio.run(save_job)


if __name__ == "__main__":
    main()