flickr: Fix regex replacement
[botscripts.git] / o / toolserver / bryan / flickr / bots / flickreviewr.py
1 """
2  Copyright (c) 2006-2009 Bryan Tong Minh
3  
4  Permission is hereby granted, free of charge, to any person
5  obtaining a copy of this software and associated documentation
6  files (the "Software"), to deal in the Software without
7  restriction, including without limitation the rights to use,
8  copy, modify, merge, publish, distribute, sublicense, and/or sell
9  copies of the Software, and to permit persons to whom the
10  Software is furnished to do so, subject to the following
11  conditions:
12  
13  The above copyright notice and this permission notice shall be
14  included in all copies or substantial portions of the Software.
15  
16  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23  OTHER DEALINGS IN THE SOFTWARE.
24 """
25 import sys, os, time
26 import traceback
27 import urllib, urlparse
28 try:
29         from hashlib import sha1
30 except ImportError:
31         from sha import sha as sha1
32
33 import re
34
35 from botbase import FlickrBotBase
36 import flickr_tools
37 import mwclient, mwclient.http
38 import requests
39 import flickr
40
class Review(object):
        """ Static lookup tables used while reviewing Flickr licenses. """

        # Parts of a CC license name that make the license unacceptable;
        # FlickreviewR.review_license() fails a license containing one.
        Failures = ['nd', 'nc']

        # Wikitext of the license tag that may be added to the page,
        # keyed by the Flickr license id (as a string).
        Licenses = dict([
                ('4', '{{cc-by-2.0}}'),
                ('5', '{{cc-by-sa-2.0}}'),
                ('7', '{{Flickr-no known copyright restrictions}}'),
                ('8', '{{PD-USGov}}'),
                ('9', '{{cc-zero}}'),
                ('10', '{{safesubst:Flickr-public domain mark/subst}}'),
        ])
52                 
53
54 class FlickreviewR(FlickrBotBase):
55         def __init__(self):
56                 FlickrBotBase.__init__(self)
57                 self.FlickrStatic = {}
58                 self.review_template_regex = re.compile(
59                         r'(\{\{(?:flickr ?r?eviewR?)?(?:User\:FlickreviewR\/reviewed\-.*?)?\}\})', re.S | re.I)
60                 self.cc_license_template_regex = re.compile(
61                         r'\{\{(cc\-by(?:\-sa)?(?:\-[0-9]\.[0-9])?)(?:\|.*?)?\}\}', re.S | re.I)
62                 self.bad_authors = re.findall(r"^[^#\s].+$", mwclient.page.Page(self.site, "User:FlickreviewR/bad-authors").text(), re.M)
63         
64         
65         # Functions part of the review process
66         def run(self, max = -1):
67                 """ Main function. """
68                 
69                 count = 0
70                 for image in self.site.Categories['Flickr review needed']:
71                         if count == max: return
72                         
73                         if image.namespace != 6: continue
74                         
75                         try:
76                                 # Review
77                                 review_result, data = self.review(image)
78                                 self.output(u'* [[:%s]] %s %s %s' % (image, review_result, data['license'], data['data']), True)
79                                 # Post
80                                 self.post_result(image, review_result, data)
81                         except mwclient.InsufficientPermission:
82                                 self.output(u'Insufficient permission editing %s' % image.name)
83                                 continue
84                         except mwclient.APIError, e:
85                                 self.output(u'APIError editing %s' % image.name)
86                                 self.output(''.join(traceback.format_exception_only(type(e), e)))
87                                 continue
88                         except mwclient.EditError, e:
89                                 self.output(u'EditError editing %s' % image.name)
90                                 traceback.print_exc()
91                                 continue
92                         except requests.HTTPError, e:
93                                 self.output(u'HTTPError editing %s' % image.name)
94                                 traceback.print_exc()
95                                 continue
96                         except:
97                                 self.output(u'Exception reviewing image %s' % image.name)
98                                 raise
99                         
100                         # Store
101                         #self.store_author(data['author'])
102                         #self.store_result(image.name, review_result, data['license'], data['photo_id'], 
103                         #       data['author'][0], data['uploaded'])
104                                 
105                         count = count + 1
106         
107         def review(self, image):
108                 """ Reviews an image.
109                 Returns:
110                         (Results, Author, Photo_Id, License, Data)
111                 """
112         
113                 return_data = {'author':('',''), 'photo_id':'', 'license':'', 'data':'', 'uploaded': False, 'license_wikitext': None}
114                 data = image.text()
115                 if not data: return
116                 categories = list(image.categories(generator = False))
117                 templates = list(image.templates(generator = False))
118                 imageinfo = image.imageinfo
119                 photo_id = flickr_tools.get_photo_id(data)
120                 
121                 # Check whether there is a link to a Flickr page
122                 if photo_id:
123                         return_data['photo_id'] = photo_id
124                         try:
125                                 flickr_image = self.Flickr[photo_id]
126                         except KeyError:
127                                 return 'flickr_not_found', return_data
128                         except TypeError: # videos
129                                 return 'size_not_found', return_data
130
131                         author = (flickr_image['owner']['nsid'], flickr_image['owner']['username'])
132                         return_data['author'] = author
133                         
134                         if not imageinfo:
135                                 return 'size_not_found', return_data
136                         
137                         license_check, license = self.review_license((flickr_image['license'], self.Flickr.licenses[flickr_image['license']]), categories, templates)
138                         return_data['license'] = license
139
140                         # Fail early if license is bad, don't go human review
141                         if license_check == 'failed':
142                                 return license_check, return_data
143
144                         # Try reviewing by EXIF information
145                         exif_passed, exif_failed, exif_unverifiable = self.review_exif(
146                                 imageinfo['metadata'], flickr_image)
147                         
148                         size = self.get_unexif_size(imageinfo)
149                         return_data['size'] = size
150                         hash_review = None
151                         # Do SHA1 review if:
152                         ##      - The EXIF information is too limited to be certain
153                         ##      - The EXIF information does not match
154                         ##      - Uploading a high resolution version is possible
155                         
156                         if len(exif_passed) < 10 or exif_failed or self.can_upload(size, flickr_image):
157                                 flickr_file = self.get_flickr_image(size, int(imageinfo['size']), flickr_image)
158                                 if not flickr_file: 
159                                         return 'size_not_found', return_data
160                                 
161                                 hash_review = self.review_sha1(imageinfo['sha1'], flickr_file)
162                                 # If the MD5 review does not match and the EXIF comparison was not sufficient:
163                                 if not hash_review and (len(exif_passed) < 10 or exif_failed):
164                                         return 'hash_not_matching', return_data
165                         
166                         # Images match, review license
167                         
168                         if author[0] in self.bad_authors:
169                                 return 'bad_author', return_data
170                         
171                         # "No known restrictions" special case
172                         if flickr_image['license'] == u'7':
173                                 if author[0] == '8623220@N02':
174                                         return_data['license'] = 'pd'
175                                         return 'library_of_congress', return_data
176                                 elif author[0] == '24785917@N03':
177                                         return_data['license'] = 'pd'
178                                         return 'powerhouse_museum', return_data
179                         
180                         # Try substituting the license tag if necessary
181                         if license_check == 'passed_changed':
182                                 return_data['data'] = self.try_license_subst(image, license)
183
184                         # Add the license tag if necessary
185                         if not u'Template:License template tag' in templates and (
186                                         license_check in ['passed_changed', 'passed'] or
187                                         (license_check ==  'public_domain_mark' and not u'Template:Flickr-public domain mark/layout' in templates)
188                                         ):
189                                 return_data['license_wikitext'] = Review.Licenses.get(flickr_image['license'])
190                                 if license_check == 'passed_changed':
191                                         license_check = 'passed'
192
193                         # If licensing is ok and the SHA1 review passed, try uploading hires
194                         if license_check != 'failed' and hash_review:
195                                 try:
196                                         hires = self.upload_hires(image.name, size, flickr_image)
197                                 except mwclient.APIError, e:
198                                         self.output(u'APIError uploading %s' % image.name)
199                                         self.output(''.join(traceback.format_exception_only(type(e), e)))
200                                         return 'size_not_found', return_data
201                                 if hires: 
202                                         return_data['uploaded'] = True
203
204                         return license_check, return_data
205                 else:
206                         # No Flickr link avaiable
207                         return 'no_flickr_link', return_data
208         
209         def review_exif(self, metadata, flickr_image):
210                 """ Compares exif information from Commons and Flickr.
211                     Returns a tuple containing (similar, disimilar, missing)
212                     exif tags.
213                 """
214                 if metadata is None: 
215                         metadata = {}
216                 else:
217                         metadata = dict([(i['name'], i['value']) for i in metadata])
218                         
219                 # Prepare a list of all tags and remove duplicates
220                 tags = []
221                 tags.extend(metadata.iterkeys())
222                 tags.extend(flickr_image.exif.iterkeys())
223                 for tag in tags[:]:
224                         while tags.count(tag) > 1: tags.remove(tag)
225                         
226                 # Init return value
227                 passed, failed, unverifiable = [], [], []
228                 
229                 for tag in tags:
230                         if tag in metadata and tag in flickr_image.exif:
231                                 if unicode(metadata[tag]) == flickr_image.exif[tag]:
232                                         passed.append(tag)
233                                 else:
234                                         failed.append((tag, metadata[tag], flickr_image.exif[tag]))
235                         else:
236                                 unverifiable.append(tag)
237                 
238                 return passed, failed, unverifiable
239                 
240         def review_sha1(self, commons_hash, flickr_file):
241                 """ Compares a file with a certain SHA1 hash in base16. """
242                 
243                 flickr_hash = sha1()
244                 
245                 for chunk in flickr_file.iter_content(chunk_size=8192):
246                         flickr_hash.update(chunk)
247                 
248                 # Explicitly close files to avoid problems with httplib
249                 flickr_file.close()
250                 
251                 return commons_hash == flickr_hash.hexdigest()
252                 
253         def review_license(self, (flickr_license_id, flickr_license), categories, templates):
254                 """ Compares the licenses on Commons and Flickr. """
255                 
256                 # Flickr images with license 7 should have a license template
257                 # which categorizes in Category:Files from Flickr's 'The Commons'
258                 if flickr_license_id == u'7':
259                         if u"Category:Files from Flickr's 'The Commons'" in categories:
260                                 return 'passed', flickr_license
261                         else:
262                                 return 'passed_changed', flickr_license
263                 # United States government work, no specific category
264                 if flickr_license_id == u'8':
265                         return 'passed', flickr_license
266                         
267                         
268                 # Flickr images with license 9 should have a license template
269                 # which categorizes in Category:CC-Zero
270                 if flickr_license_id == u'9':
271                         if u'Category:CC-Zero' in categories:
272                                 return 'passed', 'cc-zero'
273                         else:
274                                 return 'passed_changed', 'cc-zero'
275                         
276                 # Flickr images with license 10 should have a license template
277                 if flickr_license_id == u'10':
278                         if (
279                                 not u'Template:PD-Layout' in templates or
280                                 any(template in templates for template in [
281                                         u'Template:PD-self',
282                                         u'Template:PD-author',
283                                 ])
284                         ):
285                                 return 'public_domain_mark', flickr_license
286                         else:
287                                 return 'passed', flickr_license
288                         
289                 # Unconditionally pass images with OTRS permission confirmed    
290                 if u'Category:Items with OTRS permission confirmed' in categories:
291                         return 'passed', 'otrs'
292                 
293                 # Verify CC licenses
294                 flickr_license = flickr_license.split('-')
295                 if flickr_license[0] == 'cc':
296                         for item in flickr_license[1:-1]:
297                                 if item in Review.Failures:
298                                         # Image has one of the ND/NC licenses
299                                         return 'failed', '-'.join(flickr_license)
300                         # Image is under a free CC license
301                         if u'Category:' + u'-'.join(flickr_license).upper() in categories:
302                                 # License on Flickr and Commons are the same
303                                 return 'passed', '-'.join(flickr_license)
304                         else:
305                                 # License on Flickr and Commons differ
306                                 return 'passed_changed', '-'.join(flickr_license)
307                 else:
308                         # A non CC license
309                         return 'failed', '-'.join(flickr_license)
310                 
311         
312         # Various helper functions to get something from Flickr
313
314         def get_flickr_image(self, commons_image_size, file_size, flickr_image):
315                 """ Return a file object containing the Flickr image 
316                     matching the size of the Commons image. """
317                 for flickr_image_size, location in flickr_image.sizes:
318                         if commons_image_size != flickr_image_size:
319                                 continue
320                         # Found an image with the same image size
321                         # Perform a request to obtain the Content-Length header
322                         r = requests.get(location, stream=True)
323                         try:
324                                 content_length = int(r.headers['content-length'])
325                         except KeyError:
326                                 content_length = len(r.content)
327                         if r.status_code == 200 and content_length == file_size:
328                                 # Get the file
329                                 return r
330                         self.output(u'File sizes do not match. Commons: %s; Flickr: %s' % (file_size, content_length))
331         
332         # Upload functions
333         
334         def upload_hires(self, name, commons_size, flickr_image):
335                 """ Upload a hires version of a flickr image. """
336                 
337                 # Only upload if it is sure that the current image is a thumbnail
338                 if commons_size not in [size for size, loc in flickr_image.sizes]:
339                         self.output(u'No matching image size found for %s' %name)
340                         return
341                 
342                 # Find the highest resolution image
343                 hires = max(flickr_image.sizes, key = lambda i: i[0][0] * i[0][1])
344                 if hires[0][0] > commons_size[0]:
345                         # Upload if it is larger than the Commons image
346                         hires_image = flickr_tools.download_temporary(flickr_image)
347                         
348                         # Perform rotation if necessary 
349                         rotation = flickr_tools.get_rotation(flickr_image)
350                         if rotation:
351                                 self.output(u'Rotated %s %s degrees' % (name, rotation))
352                                 hires_image = flickr_tools.rotate(hires_image, 
353                                                 rotation, self.config['jpegtran'])
354                         
355                         try:
356                                 self.site.upload(hires_image, name[name.find(':') + 1:],
357                                         'Replacing image by its original image from Flickr', ignore = True)
358                         except mwclient.APIError as e:
359                                 if e.code == 'fileexists-no-change':
360                                         self.output(u'File has no change... for some reason')
361                                         return
362                                 raise
363                         else:
364                                 return hires[0]
365                 
366         def can_upload(self, commons_size, flickr_image):
367                 """Only upload in case the current version matches some version from Flickr!"""
368                 return (commons_size in [size for size, loc in flickr_image.sizes]) and \
369                         (commons_size[0] != max(flickr_image.sizes, key = lambda i: i[0][0]))
370         
371         
372         # Store and post functions
373         
374         def store_result(self, image, review_result, license, photo_id, author_nsid, upload):
375                 """ Push the result to the database. """
376                 
377                 # Init defaults
378                 if not photo_id: photo_id = 0
379                 if not author_nsid: author_nsid = None
380                 if not license: license = None
381                 if license == 'pd': review_result = 'passed'
382                         
383                 self.database.insert('review',
384                         ('rv_timestamp', 'rv_image', 'rv_result', 'rv_license', 'rv_photo_id', 'rv_nsid', 'rv_uploaded'),
385                         (self.database.now(), image[6:].replace(' ', '_'), review_result, license,
386                                 photo_id, author_nsid, int(upload)))
387                 self.database.commit()
388                 
389         def post_result(self, page, result, data):
390                 """ Push the result to Commons. """
391                 
392                 author = data['author'][1]
393                 if re.search(r'[\[\]\|<>\{\}]', author):
394                         author = u'<nowiki>' + author + u'</nowiki>'
395                 extras = ''
396                 if result == 'passed_changed' and data['data']:
397                         extras += u'|uploadlicense=%s' % data['data']
398                 t_data = u'{{FlickreviewR|status=%s|author=%s|sourceurl=https://flickr.com/photos/%s/%s|reviewdate=%s|reviewlicense=%s|reviewer={{subst:REVISIONUSER}}%s}}' % \
399                         (result, author, data['author'][0], data['photo_id'], time.strftime('%Y-%m-%d %H:%M:%S'), data['license'], extras)
400                 if data['license_wikitext']:
401                         t_data = data['license_wikitext'] + u'\n' + t_data
402
403                 text = page.text()
404                 if result == 'passed_changed' and data['data']:
405                         text = text.replace(data['data'], data['license'])
406
407                 text = self.review_template_regex.sub(t_data, text)
408
409                 summary = '[[User:FlickreviewR 2|FlickreviewR 2]]: %s %s' % (result, data['license'])
410                 page.save(text, summary = summary)
411                 
412         def try_license_subst(self, image, license):
413                 match = self.cc_license_template_regex.search(image.text())
414                 if match:
415                         old_license = match.group(1) 
416                         return old_license
417                 return ''
418         
419         def get_unexif_size(self, imageinfo):
420                 """ Get the real image width and height without EXIF consideration for flickr compatibility. """
421                 
422                 size = (int(imageinfo['width']), int(imageinfo['height']))
423                 
424                 metadata = imageinfo.get('metadata') or []
425                 for i in metadata:
426                         if i['name'] == 'Orientation':
427                                 if i['value'] in [6, 8]: # 8 for 90, 3 for 180, 6 for 270
428                                         size = size[::-1]
429                                 break
430                 
431                 return size
432                 
433
if __name__ == '__main__':
        import sys

        # The bot is always constructed; reviewing only starts when
        # '-r' is among the command line arguments.
        bot = FlickreviewR()
        start_review = '-r' in sys.argv
        if start_review:
                bot.run()