From 0db5699ddcca2ce93f1c8966ab001eeeca2efb62 Mon Sep 17 00:00:00 2001 From: Sam Wilson Date: Sun, 10 Nov 2024 07:33:33 +0800 Subject: [PATCH] Add export:category command Add a new command to export pages and files in a category. Bug: GH#238 --- README.md | 16 +++ bin/mwcli | 2 + i18n/en.json | 4 + src/Command/ExportCategoryCommand.php | 148 ++++++++++++++++++++++++++ 4 files changed, 170 insertions(+) create mode 100644 src/Command/ExportCategoryCommand.php diff --git a/README.md b/README.md index 2e33ea9..4ff811f 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,22 @@ Remove a site from the config file. * `--wiki` `-w` — The mwcli name of the wiki to use. Use sites:list to list all. *Required.* +### export:category + +Export pages and files in a category (and its subcategories). + + export:category [-c|--config [CONFIG]] [-w|--wiki WIKI] [-a|--category CATEGORY] [-d|--dest DEST] [-o|--only-author] + +* `--config` `-c` — Path of the Yaml config file to use. + Default: '[CWD]/config.yml' +* `--wiki` `-w` — The mwcli name of the wiki to use. Use sites:list to list all. + *Required.* +* `--category` `-a` — Name of the category to export (with or without the leading 'Category:', and in any language). + *Required.* +* `--dest` `-d` — The destination directory for exported files. + Default: '[CWD]/categories' +* `--only-author` `-o` — Export only where the given user is the original author of a page. + ### export:contribs Export a user's contributions. 
diff --git a/bin/mwcli b/bin/mwcli index 8451ac5..08ad9d5 100755 --- a/bin/mwcli +++ b/bin/mwcli @@ -4,6 +4,7 @@ require __DIR__.'/../vendor/autoload.php'; use Samwilson\MediaWikiCLI\Command\AuthCommand; +use Samwilson\MediaWikiCLI\Command\ExportCategoryCommand; use Samwilson\MediaWikiCLI\Command\ExportContribsCommand; use Samwilson\MediaWikiCLI\Command\ExportWikitextCommand; use Samwilson\MediaWikiCLI\Command\ExtensionInstallCommand; @@ -23,6 +24,7 @@ $application->add(new SitesInfoCommand()); $application->add(new SitesListCommand()); $application->add(new SitesRemoveCommand()); $application->add(new AuthCommand()); +$application->add(new ExportCategoryCommand()); $application->add(new ExportContribsCommand()); $application->add(new ExportWikitextCommand()); $application->add(new ExtensionInstallCommand()); diff --git a/i18n/en.json b/i18n/en.json index bef6c73..91886b6 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -15,6 +15,10 @@ "option-wiki-desc": "The mwcli name of the wiki to use. 
Use sites:list to list all.", "option-wiki-missing": "Please specify a wiki with the `--wiki` option.", + "command-export-category-desc": "Export pages and files in a category (and its subcategories).", + "option-category-desc": "Name of the category to export (with or without the leading 'Category:', and in any language).", + "export-cat-not-found": "Category not found: $1", + "command-export-contribs-desc": "Export a user's contributions.", "option-dest-desc": "The destination directory for exported files.", "option-user-desc": "Export contributions of this username.", diff --git a/src/Command/ExportCategoryCommand.php b/src/Command/ExportCategoryCommand.php new file mode 100644 index 0000000..33031ff --- /dev/null +++ b/src/Command/ExportCategoryCommand.php @@ -0,0 +1,148 @@ +setName( 'export:category' ); + $this->setDescription( $this->msg( 'command-export-category-desc' ) ); + $this->addOption( 'wiki', 'w', InputOption::VALUE_REQUIRED, $this->msg( 'option-wiki-desc' ) ); + $this->addOption( 'category', 'a', InputOption::VALUE_REQUIRED, $this->msg( 'option-category-desc' ) ); + $this->addOption( 'dest', 'd', InputOption::VALUE_REQUIRED, $this->msg( 'option-dest-desc' ), + $this->getConfigDirDefault() . 'categories' ); + } + + public function execute( InputInterface $input, OutputInterface $output ) { + $ret = parent::execute( $input, $output ); + if ( $ret ) { + return $ret; + } + $siteInfo = $this->getSite( $input ); + if ( !$siteInfo ) { + return Command::FAILURE; + } + $this->api = $this->getApi( $siteInfo ); + $catTraverser = ( new MediawikiFactory( $this->api ) )->newCategoryTraverser(); + $catTraverser->addCallback( CategoryTraverser::CALLBACK_PAGE, [ $this, 'descender' ] ); + $catTraverser->addCallback( CategoryTraverser::CALLBACK_CATEGORY, [ $this, 'descender' ] ); + + // The category option can be with or without the namespace prefix, and it can be given as any of its aliases. 
+ $categoryName = $input->getOption( 'category' ); + if ( !$categoryName ) { + $this->io->warning( 'Please set the --category option.' ); + return Command::FAILURE; + } + $catAliases = ( new NamespaceGetter( $this->api ) ) + ->getNamespaceByName( 'Category' ) + ->getAliases(); + $catPrefixes = array_merge( [ 'Category' ], $catAliases ); + foreach ( $catPrefixes as $catPrefix ) { + if ( str_starts_with( $categoryName, $catPrefix . ':' ) ) { + $categoryName = substr( $categoryName, strlen( $catPrefix ) + 1 ); + } + } + $categoryNamespaceId = 14; + $catTitle = new Title( 'Category:' . ucfirst( $categoryName ), $categoryNamespaceId ); + $this->io->writeln( 'Downloading ' . $catTitle->getText() ); + + $this->optionDest = $input->getOption( 'dest' ) . '/' . $siteInfo['id']; + + $catProps = [ + 'titles' => $catTitle->getText(), + 'action' => 'query', + 'prop' => 'info', + 'formatversion' => 2, + 'inprop' => 'url', + ]; + $cat = $this->api->request( ActionRequest::simpleGet( 'query', $catProps ) ); + $catInfo = reset( $cat['query']['pages'] ); + if ( isset( $catInfo['missing'] ) ) { + $this->io->error( $this->msg( 'export-cat-not-found', [ $catInfo['canonicalurl'] ] ) ); + return Command::FAILURE; + } + $catTraverser->descend( new Page( new PageIdentifier( $catTitle ) ) ); + return Command::SUCCESS; + } + + public function descender( Page $member, Page $rootCat ) { + $title = $member->getPageIdentifier()->getTitle()->getText(); + + // Separate namespace and page names. + $firstColon = strpos( $title, ':' ); + $namespace = $firstColon ? substr( $title, 0, $firstColon ) : '(main)'; + $pageTitlePart = $firstColon ? substr( $title, $firstColon + 1 ) : $title; + $pageTitle = str_replace( ' ', '_', $pageTitlePart ); + + $this->io->writeln( "Downloading $title . . . 
" ); + $pageInfo = $this->api->request( ActionRequest::simpleGet( 'query', [ + 'prop' => 'imageinfo|revisions', + 'iiprop' => 'url|sha1|timestamp', + 'titles' => $title, + 'rvprop' => 'content', + 'rvslots' => 'main|mediainfo', + 'formatversion' => 2, + ] ) ); + + if ( !isset( $pageInfo['query']['pages'] ) ) { + echo "Unable to get $title\n"; + exit(); + } + $page = array_shift( $pageInfo['query']['pages'] ); + + // File. + if ( isset( $page['imageinfo'] ) ) { + $fileUrl = $page['imageinfo'][0]['url']; + $destFile = $this->optionDest . '/files/' . basename( $fileUrl ); + if ( !is_file( $destFile ) || sha1_file( $destFile ) !== $page['imageinfo'][0]['sha1'] ) { + if ( !is_dir( dirname( $destFile ) ) ) { + $this->io->writeln( 'Creating directory ' . dirname( $destFile ) ); + mkdir( dirname( $destFile ), 0755, true ); + } + $this->io->writeln( " File: $destFile" ); + ( new Client() )->get( $fileUrl, [ 'sink' => $destFile ] ); + } + } + + // Wikitext of the page. + $destWikitext = $this->optionDest . '/' . $namespace . '/' . $pageTitle . '.wikitext'; + $rev = reset( $page['revisions'] ); + $content = $rev['slots']['main']['content']; + if ( !empty( trim( $content ) ) ) { + if ( !is_dir( dirname( $destWikitext ) ) ) { + $this->io->writeln( 'Creating directory: ' . dirname( $destWikitext ) ); + mkdir( dirname( $destWikitext ), 0755, true ); + } + $this->io->writeln( " Wikitext: $destWikitext" ); + file_put_contents( $destWikitext, $content ); + } + + // MediaInfo JSON. + if ( isset( $rev['slots']['mediainfo']['content'] ) ) { + $destMediaInfo = $this->optionDest . '/' . $namespace . '/' . $pageTitle . '_mediainfo.json'; + $this->io->writeln( " Structured data: $destMediaInfo" ); + file_put_contents( $destMediaInfo, $rev['slots']['mediainfo']['content'] ); + } + } +}