2 Copyright (c) 2006-2009 Bryan Tong Minh
4 Permission is hereby granted, free of charge, to any person
5 obtaining a copy of this software and associated documentation
6 files (the "Software"), to deal in the Software without
7 restriction, including without limitation the rights to use,
8 copy, modify, merge, publish, distribute, sublicense, and/or sell
9 copies of the Software, and to permit persons to whom the
10 Software is furnished to do so, subject to the following
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 OTHER DEALINGS IN THE SOFTWARE.
27 import urllib, urlparse
29 from hashlib import sha1
31 from sha import sha as sha1
35 from botbase import FlickrBotBase
37 import mwclient, mwclient.http
42 Failures = ['nd', 'nc']
46 '5': '{{cc-by-sa-2.0}}',
47 '7': '{{Flickr-no known copyright restrictions}}',
50 '10': '{{safesubst:Flickr-public domain mark/subst}}'
54 class FlickreviewR(FlickrBotBase):
56 FlickrBotBase.__init__(self)
57 self.FlickrStatic = {}
58 self.review_template_regex = re.compile(
59 r'(\{\{(?:flickr ?r?eviewR?)?(?:User\:FlickreviewR\/reviewed\-.*?)?\}\})', re.S | re.I)
60 self.cc_license_template_regex = re.compile(
61 r'\{\{(cc\-by(?:\-sa)?(?:\-[0-9]\.[0-9])?)(?:\|.*?)?\}\}', re.S | re.I)
62 self.bad_authors = re.findall(r"^[^#\s].+$", mwclient.page.Page(self.site, "User:FlickreviewR/bad-authors").text(), re.M)
65 # Functions part of the review process
66 def run(self, max = -1):
67 """ Main function. """
70 for image in self.site.Categories['Flickr review needed']:
71 if count == max: return
73 if image.namespace != 6: continue
77 review_result, data = self.review(image)
78 self.output(u'* [[:%s]] %s %s %s' % (image, review_result, data['license'], data['data']), True)
80 self.post_result(image, review_result, data)
81 except mwclient.InsufficientPermission:
82 self.output(u'Insufficient permission editing %s' % image.name)
84 except mwclient.APIError, e:
85 self.output(u'APIError editing %s' % image.name)
86 self.output(''.join(traceback.format_exception_only(type(e), e)))
88 except mwclient.EditError, e:
89 self.output(u'EditError editing %s' % image.name)
92 except requests.HTTPError, e:
93 self.output(u'HTTPError editing %s' % image.name)
97 self.output(u'Exception reviewing image %s' % image.name)
101 #self.store_author(data['author'])
102 #self.store_result(image.name, review_result, data['license'], data['photo_id'],
103 # data['author'][0], data['uploaded'])
107 def review(self, image):
108 """ Reviews an image.
110 (Results, Author, Photo_Id, License, Data)
113 return_data = {'author':('',''), 'photo_id':'', 'license':'', 'data':'', 'uploaded': False, 'license_wikitext': None}
116 categories = list(image.categories(generator = False))
117 templates = list(image.templates(generator = False))
118 imageinfo = image.imageinfo
119 photo_id = flickr_tools.get_photo_id(data)
121 # Check whether there is a link to a Flickr page
123 return_data['photo_id'] = photo_id
125 flickr_image = self.Flickr[photo_id]
127 return 'flickr_not_found', return_data
128 except TypeError: # videos
129 return 'size_not_found', return_data
131 author = (flickr_image['owner']['nsid'], flickr_image['owner']['username'])
132 return_data['author'] = author
135 return 'size_not_found', return_data
137 license_check, license = self.review_license((flickr_image['license'], self.Flickr.licenses[flickr_image['license']]), categories, templates)
138 return_data['license'] = license
140 # Fail early if license is bad, don't go human review
141 if license_check == 'failed':
142 return license_check, return_data
144 # Try reviewing by EXIF information
145 exif_passed, exif_failed, exif_unverifiable = self.review_exif(
146 imageinfo['metadata'], flickr_image)
148 size = self.get_unexif_size(imageinfo)
149 return_data['size'] = size
152 ## - The EXIF information is too limited to be certain
153 ## - The EXIF information does not match
154 ## - Uploading a high resolution version is possible
156 if len(exif_passed) < 10 or exif_failed or self.can_upload(size, flickr_image):
157 flickr_file = self.get_flickr_image(size, int(imageinfo['size']), flickr_image)
159 return 'size_not_found', return_data
161 hash_review = self.review_sha1(imageinfo['sha1'], flickr_file)
# If the SHA1 review does not match and the EXIF comparison was not sufficient:
163 if not hash_review and (len(exif_passed) < 10 or exif_failed):
164 return 'hash_not_matching', return_data
166 # Images match, review license
168 if author[0] in self.bad_authors:
169 return 'bad_author', return_data
171 # "No known restrictions" special case
172 if flickr_image['license'] == u'7':
173 if author[0] == '8623220@N02':
174 return_data['license'] = 'pd'
175 return 'library_of_congress', return_data
176 elif author[0] == '24785917@N03':
177 return_data['license'] = 'pd'
178 return 'powerhouse_museum', return_data
180 # Try substituting the license tag if necessary
181 if license_check == 'passed_changed':
182 return_data['data'] = self.try_license_subst(image, license)
184 # Add the license tag if necessary
185 if not u'Template:License template tag' in templates and (
186 license_check in ['passed_changed', 'passed'] or
187 (license_check == 'public_domain_mark' and not u'Template:Flickr-public domain mark/layout' in templates)
189 return_data['license_wikitext'] = Review.Licenses.get(flickr_image['license'])
190 if license_check == 'passed_changed':
191 license_check = 'passed'
193 # If licensing is ok and the SHA1 review passed, try uploading hires
194 if license_check != 'failed' and hash_review:
196 hires = self.upload_hires(image.name, size, flickr_image)
197 except mwclient.APIError, e:
198 self.output(u'APIError uploading %s' % image.name)
199 self.output(''.join(traceback.format_exception_only(type(e), e)))
200 return 'size_not_found', return_data
202 return_data['uploaded'] = True
204 return license_check, return_data
# No Flickr link available
207 return 'no_flickr_link', return_data
209 def review_exif(self, metadata, flickr_image):
210 """ Compares exif information from Commons and Flickr.
Returns a tuple containing (similar, dissimilar, missing)
217 metadata = dict([(i['name'], i['value']) for i in metadata])
219 # Prepare a list of all tags and remove duplicates
221 tags.extend(metadata.iterkeys())
222 tags.extend(flickr_image.exif.iterkeys())
224 while tags.count(tag) > 1: tags.remove(tag)
227 passed, failed, unverifiable = [], [], []
230 if tag in metadata and tag in flickr_image.exif:
231 if unicode(metadata[tag]) == flickr_image.exif[tag]:
234 failed.append((tag, metadata[tag], flickr_image.exif[tag]))
236 unverifiable.append(tag)
238 return passed, failed, unverifiable
240 def review_sha1(self, commons_hash, flickr_file):
241 """ Compares a file with a certain SHA1 hash in base16. """
245 for chunk in flickr_file.iter_content(chunk_size=8192):
246 flickr_hash.update(chunk)
248 # Explicitly close files to avoid problems with httplib
251 return commons_hash == flickr_hash.hexdigest()
253 def review_license(self, (flickr_license_id, flickr_license), categories, templates):
254 """ Compares the licenses on Commons and Flickr. """
256 # Flickr images with license 7 should have a license template
257 # which categorizes in Category:Files from Flickr's 'The Commons'
258 if flickr_license_id == u'7':
259 if u"Category:Files from Flickr's 'The Commons'" in categories:
260 return 'passed', flickr_license
262 return 'passed_changed', flickr_license
263 # United States government work, no specific category
264 if flickr_license_id == u'8':
265 return 'passed', flickr_license
268 # Flickr images with license 9 should have a license template
269 # which categorizes in Category:CC-Zero
270 if flickr_license_id == u'9':
271 if u'Category:CC-Zero' in categories:
272 return 'passed', 'cc-zero'
274 return 'passed_changed', 'cc-zero'
276 # Flickr images with license 10 should have a license template
277 if flickr_license_id == u'10':
279 not u'Template:PD-Layout' in templates or
280 any(template in templates for template in [
282 u'Template:PD-author',
285 return 'public_domain_mark', flickr_license
287 return 'passed', flickr_license
289 # Unconditionally pass images with OTRS permission confirmed
290 if u'Category:Items with OTRS permission confirmed' in categories:
291 return 'passed', 'otrs'
294 flickr_license = flickr_license.split('-')
295 if flickr_license[0] == 'cc':
296 for item in flickr_license[1:-1]:
297 if item in Review.Failures:
298 # Image has one of the ND/NC licenses
299 return 'failed', '-'.join(flickr_license)
300 # Image is under a free CC license
301 if u'Category:' + u'-'.join(flickr_license).upper() in categories:
302 # License on Flickr and Commons are the same
303 return 'passed', '-'.join(flickr_license)
305 # License on Flickr and Commons differ
306 return 'passed_changed', '-'.join(flickr_license)
309 return 'failed', '-'.join(flickr_license)
312 # Various helper functions to get something from Flickr
314 def get_flickr_image(self, commons_image_size, file_size, flickr_image):
315 """ Return a file object containing the Flickr image
316 matching the size of the Commons image. """
317 for flickr_image_size, location in flickr_image.sizes:
318 if commons_image_size != flickr_image_size:
320 # Found an image with the same image size
321 # Perform a request to obtain the Content-Length header
322 r = requests.get(location, stream=True)
324 content_length = int(r.headers['content-length'])
326 content_length = len(r.content)
327 if r.status_code == 200 and content_length == file_size:
330 self.output(u'File sizes do not match. Commons: %s; Flickr: %s' % (file_size, content_length))
334 def upload_hires(self, name, commons_size, flickr_image):
335 """ Upload a hires version of a flickr image. """
337 # Only upload if it is sure that the current image is a thumbnail
338 if commons_size not in [size for size, loc in flickr_image.sizes]:
339 self.output(u'No matching image size found for %s' %name)
342 # Find the highest resolution image
343 hires = max(flickr_image.sizes, key = lambda i: i[0][0] * i[0][1])
344 if hires[0][0] > commons_size[0]:
345 # Upload if it is larger than the Commons image
346 hires_image = flickr_tools.download_temporary(flickr_image)
348 # Perform rotation if necessary
349 rotation = flickr_tools.get_rotation(flickr_image)
351 self.output(u'Rotated %s %s degrees' % (name, rotation))
352 hires_image = flickr_tools.rotate(hires_image,
353 rotation, self.config['jpegtran'])
356 self.site.upload(hires_image, name[name.find(':') + 1:],
357 'Replacing image by its original image from Flickr', ignore = True)
358 except mwclient.APIError as e:
359 if e.code == 'fileexists-no-change':
360 self.output(u'File has no change... for some reason')
def can_upload(self, commons_size, flickr_image):
    """Tell whether a higher resolution version could be uploaded.

    Only upload in case the current version matches some version from
    Flickr and that version is not already the widest one on offer.

    commons_size -- (width, height) of the file on Commons.
    flickr_image -- object whose ``sizes`` attribute yields
                    ((width, height), location) pairs.

    Returns True when ``commons_size`` equals one of the Flickr sizes and
    a wider Flickr version exists; False otherwise.
    """
    flickr_sizes = [size for size, loc in flickr_image.sizes]
    if commons_size not in flickr_sizes:
        # Also covers an empty size list, so max() below never sees [].
        return False

    # Compare width with width. The earlier expression compared the width
    # with the whole ((width, height), location) entry returned by max(),
    # which can never be equal, so the check always succeeded.
    widest = max(size[0] for size in flickr_sizes)
    return commons_size[0] != widest
372 # Store and post functions
374 def store_result(self, image, review_result, license, photo_id, author_nsid, upload):
375 """ Push the result to the database. """
378 if not photo_id: photo_id = 0
379 if not author_nsid: author_nsid = None
380 if not license: license = None
381 if license == 'pd': review_result = 'passed'
383 self.database.insert('review',
384 ('rv_timestamp', 'rv_image', 'rv_result', 'rv_license', 'rv_photo_id', 'rv_nsid', 'rv_uploaded'),
385 (self.database.now(), image[6:].replace(' ', '_'), review_result, license,
386 photo_id, author_nsid, int(upload)))
387 self.database.commit()
389 def post_result(self, page, result, data):
390 """ Push the result to Commons. """
392 author = data['author'][1]
393 if re.search(r'[\[\]\|<>\{\}]', author):
394 author = u'<nowiki>' + author + u'</nowiki>'
396 if result == 'passed_changed' and data['data']:
397 extras += u'|uploadlicense=%s' % data['data']
398 t_data = u'{{FlickreviewR|status=%s|author=%s|sourceurl=https://flickr.com/photos/%s/%s|reviewdate=%s|reviewlicense=%s|reviewer={{subst:REVISIONUSER}}%s}}' % \
399 (result, author, data['author'][0], data['photo_id'], time.strftime('%Y-%m-%d %H:%M:%S'), data['license'], extras)
400 if data['license_wikitext']:
401 t_data = data['license_wikitext'] + u'\n' + t_data
404 if result == 'passed_changed' and data['data']:
405 text = text.replace(data['data'], data['license'])
407 text = self.review_template_regex.sub(t_data, text)
409 summary = '[[User:FlickreviewR 2|FlickreviewR 2]]: %s %s' % (result, data['license'])
410 page.save(text, summary = summary)
412 def try_license_subst(self, image, license):
413 match = self.cc_license_template_regex.search(image.text())
415 old_license = match.group(1)
419 def get_unexif_size(self, imageinfo):
420 """ Get the real image width and height without EXIF consideration for flickr compatibility. """
422 size = (int(imageinfo['width']), int(imageinfo['height']))
424 metadata = imageinfo.get('metadata') or []
426 if i['name'] == 'Orientation':
427 if i['value'] in [6, 8]: # 8 for 90, 3 for 180, 6 for 270
434 if __name__ == '__main__':