diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 00000000..9d61de86 --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +Python-programming-exercises1 \ No newline at end of file diff --git a/.idea/Python-programming-exercises1.iml b/.idea/Python-programming-exercises1.iml new file mode 100644 index 00000000..1be76f1e --- /dev/null +++ b/.idea/Python-programming-exercises1.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 00000000..d8210482 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..7b4d854c --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000..87ca0151 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/scopes/scope_settings.xml b/.idea/scopes/scope_settings.xml new file mode 100644 index 00000000..922003b8 --- /dev/null +++ b/.idea/scopes/scope_settings.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..94a25f7f --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 00000000..7d49bc3b --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,501 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1421824925048 + + + 1421825011830 + + + 1421831811806 + + + 1421831973807 + + + 1421845405989 + + + 
1421845852348 + + + 1421848857166 + + + 1422000203079 + + + 1422014853159 + + + 1422021116842 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/100+ Python challenging programming exercises.txt b/100+ Python challenging programming exercises.txt deleted file mode 100644 index 66c00215..00000000 --- a/100+ Python challenging programming exercises.txt +++ /dev/null @@ -1,2376 +0,0 @@ -100+ Python challenging programming exercises - -1. Level description -Level Description -Level 1 Beginner means someone who has just gone through an introductory Python course. He can solve some problems with 1 or 2 Python classes or functions. Normally, the answers could directly be found in the textbooks. -Level 2 Intermediate means someone who has just learned Python, but already has a relatively strong programming background from before. He should be able to solve problems which may involve 3 or 3 Python classes or functions. The answers cannot be directly be found in the textbooks. -Level 3 Advanced. He should use Python to solve more complex problem using more rich libraries functions and data structures and algorithms. He is supposed to solve the problem using several Python standard packages and advanced techniques. - -2. Problem template - -#----------------------------------------# -Question -Hints -Solution - -3. Questions - -#----------------------------------------# -Question 1 -Level 1 - -Question: -Write a program which will find all such numbers which are divisible by 7 but are not a multiple of 5, -between 2000 and 3200 (both included). -The numbers obtained should be printed in a comma-separated sequence on a single line. 
- -Hints: -Consider use range(#begin, #end) method - -Solution: -l=[] -for i in range(2000, 3201): - if (i%7==0) and (i%5!=0): - l.append(str(i)) - -print ','.join(l) -#----------------------------------------# - -#----------------------------------------# -Question 2 -Level 1 - -Question: -Write a program which can compute the factorial of a given numbers. -The results should be printed in a comma-separated sequence on a single line. -Suppose the following input is supplied to the program: -8 -Then, the output should be: -40320 - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -def fact(x): - if x == 0: - return 1 - return x * fact(x - 1) - -x=int(raw_input()) -print fact(x) -#----------------------------------------# - -#----------------------------------------# -Question 3 -Level 1 - -Question: -With a given integral number n, write a program to generate a dictionary that contains (i, i*i) such that is an integral number between 1 and n (both included). and then the program should print the dictionary. -Suppose the following input is supplied to the program: -8 -Then, the output should be: -{1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49, 8: 64} - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. -Consider use dict() - -Solution: -n=int(raw_input()) -d=dict() -for i in range(1,n+1): - d[i]=i*i - -print d -#----------------------------------------# - -#----------------------------------------# -Question 4 -Level 1 - -Question: -Write a program which accepts a sequence of comma-separated numbers from console and generate a list and a tuple which contains every number. 
-Suppose the following input is supplied to the program: -34,67,55,33,12,98 -Then, the output should be: -['34', '67', '55', '33', '12', '98'] -('34', '67', '55', '33', '12', '98') - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. -tuple() method can convert list to tuple - -Solution: -values=raw_input() -l=values.split(",") -t=tuple(l) -print l -print t -#----------------------------------------# - -#----------------------------------------# -Question 5 -Level 1 - -Question: -Define a class which has at least two methods: -getString: to get a string from console input -printString: to print the string in upper case. -Also please include simple test function to test the class methods. - -Hints: -Use __init__ method to construct some parameters - -Solution: -class InputOutString(object): - def __init__(self): - self.s = "" - - def getString(self): - self.s = raw_input() - - def printString(self): - print self.s.upper() - -strObj = InputOutString() -strObj.getString() -strObj.printString() -#----------------------------------------# - -#----------------------------------------# -Question 6 -Level 2 - -Question: -Write a program that calculates and prints the value according to the given formula: -Q = Square root of [(2 * C * D)/H] -Following are the fixed values of C and H: -C is 50. H is 30. -D is the variable whose values should be input to your program in a comma-separated sequence. -Example -Let us assume the following comma separated input sequence is given to the program: -100,150,180 -The output of the program should be: -18,22,24 - -Hints: -If the output received is in decimal form, it should be rounded off to its nearest value (for example, if the output received is 26.0, it should be printed as 26) -In case of input data being supplied to the question, it should be assumed to be a console input. 
- -Solution: -#!/usr/bin/env python -import math -c=50 -h=30 -value = [] -items=[x for x in raw_input().split(',')] -for d in items: - value.append(str(int(round(math.sqrt(2*c*float(d)/h))))) - -print ','.join(value) -#----------------------------------------# - -#----------------------------------------# -Question 7 -Level 2 - -Question: -Write a program which takes 2 digits, X,Y as input and generates a 2-dimensional array. The element value in the i-th row and j-th column of the array should be i*j. -Note: i=0,1.., X-1; j=0,1,¡­Y-1. -Example -Suppose the following inputs are given to the program: -3,5 -Then, the output of the program should be: -[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8]] - -Hints: -Note: In case of input data being supplied to the question, it should be assumed to be a console input in a comma-separated form. - -Solution: -input_str = raw_input() -dimensions=[int(x) for x in input_str.split(',')] -rowNum=dimensions[0] -colNum=dimensions[1] -multilist = [[0 for col in range(colNum)] for row in range(rowNum)] - -for row in range(rowNum): - for col in range(colNum): - multilist[row][col]= row*col - -print multilist -#----------------------------------------# - -#----------------------------------------# -Question 8 -Level 2 - -Question: -Write a program that accepts a comma separated sequence of words as input and prints the words in a comma-separated sequence after sorting them alphabetically. -Suppose the following input is supplied to the program: -without,hello,bag,world -Then, the output should be: -bag,hello,without,world - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. 
- -Solution: -items=[x for x in raw_input().split(',')] -items.sort() -print ','.join(items) -#----------------------------------------# - -#----------------------------------------# -Question 9 -Level 2 - -Question£º -Write a program that accepts sequence of lines as input and prints the lines after making all characters in the sentence capitalized. -Suppose the following input is supplied to the program: -Hello world -Practice makes perfect -Then, the output should be: -HELLO WORLD -PRACTICE MAKES PERFECT - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -lines = [] -while True: - s = raw_input() - if s: - lines.append(s.upper()) - else: - break; - -for sentence in lines: - print sentence -#----------------------------------------# - -#----------------------------------------# -Question 10 -Level 2 - -Question: -Write a program that accepts a sequence of whitespace separated words as input and prints the words after removing all duplicate words and sorting them alphanumerically. -Suppose the following input is supplied to the program: -hello world and practice makes perfect and hello world again -Then, the output should be: -again and hello makes perfect practice world - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. -We use set container to remove duplicated data automatically and then use sorted() to sort the data. - -Solution: -s = raw_input() -words = [word for word in s.split(" ")] -print " ".join(sorted(list(set(words)))) -#----------------------------------------# - -#----------------------------------------# -Question 11 -Level 2 - -Question: -Write a program which accepts a sequence of comma separated 4 digit binary numbers as its input and then check whether they are divisible by 5 or not. The numbers that are divisible by 5 are to be printed in a comma separated sequence. 
-Example: -0100,0011,1010,1001 -Then the output should be: -1010 -Notes: Assume the data is input by console. - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -value = [] -items=[x for x in raw_input().split(',')] -for p in items: - intp = int(p, 2) - if not intp%5: - value.append(p) - -print ','.join(value) -#----------------------------------------# - -#----------------------------------------# -Question 12 -Level 2 - -Question: -Write a program, which will find all such numbers between 1000 and 3000 (both included) such that each digit of the number is an even number. -The numbers obtained should be printed in a comma-separated sequence on a single line. - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -values = [] -for i in range(1000, 3001): - s = str(i) - if (int(s[0])%2==0) and (int(s[1])%2==0) and (int(s[2])%2==0) and (int(s[3])%2==0): - values.append(s) -print ",".join(values) -#----------------------------------------# - -#----------------------------------------# -Question 13 -Level 2 - -Question: -Write a program that accepts a sentence and calculate the number of letters and digits. -Suppose the following input is supplied to the program: -hello world! 123 -Then, the output should be: -LETTERS 10 -DIGITS 3 - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -s = raw_input() -d={"DIGITS":0, "LETTERS":0} -for c in s: - if c.isdigit(): - d["DIGITS"]+=1 - elif c.isalpha(): - d["LETTERS"]+=1 - else: - pass -print "LETTERS", d["LETTERS"] -print "DIGITS", d["DIGITS"] -#----------------------------------------# - -#----------------------------------------# -Question 14 -Level 2 - -Question: -Write a program that accepts a sentence and calculate the number of upper case letters and lower case letters. 
-Suppose the following input is supplied to the program: -Hello world! -Then, the output should be: -UPPER CASE 1 -LOWER CASE 9 - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -s = raw_input() -d={"UPPER CASE":0, "LOWER CASE":0} -for c in s: - if c.isupper(): - d["UPPER CASE"]+=1 - elif c.islower(): - d["LOWER CASE"]+=1 - else: - pass -print "UPPER CASE", d["UPPER CASE"] -print "LOWER CASE", d["LOWER CASE"] -#----------------------------------------# - -#----------------------------------------# -Question 15 -Level 2 - -Question: -Write a program that computes the value of a+aa+aaa+aaaa with a given digit as the value of a. -Suppose the following input is supplied to the program: -9 -Then, the output should be: -11106 - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -a = raw_input() -n1 = int( "%s" % a ) -n2 = int( "%s%s" % (a,a) ) -n3 = int( "%s%s%s" % (a,a,a) ) -n4 = int( "%s%s%s%s" % (a,a,a,a) ) -print n1+n2+n3+n4 -#----------------------------------------# - -#----------------------------------------# -Question 16 -Level 2 - -Question: -Use a list comprehension to square each odd number in a list. The list is input by a sequence of comma-separated numbers. -Suppose the following input is supplied to the program: -1,2,3,4,5,6,7,8,9 -Then, the output should be: -1,3,5,7,9 - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -values = raw_input() -numbers = [x for x in values.split(",") if int(x)%2!=0] -print ",".join(numbers) -#----------------------------------------# - -Question 17 -Level 2 - -Question: -Write a program that computes the net amount of a bank account based a transaction log from console input. The transaction log format is shown as following: -D 100 -W 200 -¡­ -D means deposit while W means withdrawal. 
-Suppose the following input is supplied to the program: -D 300 -D 300 -W 200 -D 100 -Then, the output should be: -500 - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: -import sys -netAmount = 0 -while True: - s = raw_input() - if not s: - break - values = s.split(" ") - operation = values[0] - amount = int(values[1]) - if operation=="D": - netAmount+=amount - elif operation=="W": - netAmount-=amount - else: - pass -print netAmount -#----------------------------------------# - -#----------------------------------------# -Question 18 -Level 3 - -Question: -A website requires the users to input username and password to register. Write a program to check the validity of password input by users. -Following are the criteria for checking the password: -1. At least 1 letter between [a-z] -2. At least 1 number between [0-9] -1. At least 1 letter between [A-Z] -3. At least 1 character from [$#@] -4. Minimum length of transaction password: 6 -5. Maximum length of transaction password: 12 -Your program should accept a sequence of comma separated passwords and will check them according to the above criteria. Passwords that match the criteria are to be printed, each separated by a comma. -Example -If the following passwords are given as input to the program: -ABd1234@1,a F1#,2w3E*,2We3345 -Then, the output of the program should be: -ABd1234@1 - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. 
- -Solutions: -import re -value = [] -items=[x for x in raw_input().split(',')] -for p in items: - if len(p)<6 or len(p)>12: - continue - else: - pass - if not re.search("[a-z]",p): - continue - elif not re.search("[0-9]",p): - continue - elif not re.search("[A-Z]",p): - continue - elif not re.search("[$#@]",p): - continue - elif re.search("\s",p): - continue - else: - pass - value.append(p) -print ",".join(value) -#----------------------------------------# - -#----------------------------------------# -Question 19 -Level 3 - -Question: -You are required to write a program to sort the (name, age, height) tuples by ascending order where name is string, age and height are numbers. The tuples are input by console. The sort criteria is: -1: Sort based on name; -2: Then sort based on age; -3: Then sort by score. -The priority is that name > age > score. -If the following tuples are given as input to the program: -Tom,19,80 -John,20,90 -Jony,17,91 -Jony,17,93 -Json,21,85 -Then, the output of the program should be: -[('John', '20', '90'), ('Jony', '17', '91'), ('Jony', '17', '93'), ('Json', '21', '85'), ('Tom', '19', '80')] - -Hints: -In case of input data being supplied to the question, it should be assumed to be a console input. -We use itemgetter to enable multiple sort keys. - -Solutions: -from operator import itemgetter, attrgetter - -l = [] -while True: - s = raw_input() - if not s: - break - l.append(tuple(s.split(","))) - -print sorted(l, key=itemgetter(0,1,2)) -#----------------------------------------# - -#----------------------------------------# -Question 20 -Level 3 - -Question: -Define a class with a generator which can iterate the numbers, which are divisible by 7, between a given range 0 and n. 
- -Hints: -Consider use yield - -Solution: -def putNumbers(n): - i = 0 - while ilen2: - print s1 - elif len2>len1: - print s2 - else: - print s1 - print s2 - - -printValue("one","three") - - - -#----------------------------------------# -2.10 - -Question: -Define a function that can accept an integer number as input and print the "It is an even number" if the number is even, otherwise print "It is an odd number". - -Hints: - -Use % operator to check if a number is even or odd. - -Solution -def checkValue(n): - if n%2 == 0: - print "It is an even number" - else: - print "It is an odd number" - - -checkValue(7) - - -#----------------------------------------# -2.10 - -Question: -Define a function which can print a dictionary where the keys are numbers between 1 and 3 (both included) and the values are square of keys. - -Hints: - -Use dict[key]=value pattern to put entry into a dictionary. -Use ** operator to get power of a number. - -Solution -def printDict(): - d=dict() - d[1]=1 - d[2]=2**2 - d[3]=3**2 - print d - - -printDict() - - - - - -#----------------------------------------# -2.10 - -Question: -Define a function which can print a dictionary where the keys are numbers between 1 and 20 (both included) and the values are square of keys. - -Hints: - -Use dict[key]=value pattern to put entry into a dictionary. -Use ** operator to get power of a number. -Use range() for loops. - -Solution -def printDict(): - d=dict() - for i in range(1,21): - d[i]=i**2 - print d - - -printDict() - - -#----------------------------------------# -2.10 - -Question: -Define a function which can generate a dictionary where the keys are numbers between 1 and 20 (both included) and the values are square of keys. The function should just print the values only. - -Hints: - -Use dict[key]=value pattern to put entry into a dictionary. -Use ** operator to get power of a number. -Use range() for loops. -Use keys() to iterate keys in the dictionary. Also we can use item() to get key/value pairs. 
- -Solution -def printDict(): - d=dict() - for i in range(1,21): - d[i]=i**2 - for (k,v) in d.items(): - print v - - -printDict() - -#----------------------------------------# -2.10 - -Question: -Define a function which can generate a dictionary where the keys are numbers between 1 and 20 (both included) and the values are square of keys. The function should just print the keys only. - -Hints: - -Use dict[key]=value pattern to put entry into a dictionary. -Use ** operator to get power of a number. -Use range() for loops. -Use keys() to iterate keys in the dictionary. Also we can use item() to get key/value pairs. - -Solution -def printDict(): - d=dict() - for i in range(1,21): - d[i]=i**2 - for k in d.keys(): - print k - - -printDict() - - -#----------------------------------------# -2.10 - -Question: -Define a function which can generate and print a list where the values are square of numbers between 1 and 20 (both included). - -Hints: - -Use ** operator to get power of a number. -Use range() for loops. -Use list.append() to add values into a list. - -Solution -def printList(): - li=list() - for i in range(1,21): - li.append(i**2) - print li - - -printList() - -#----------------------------------------# -2.10 - -Question: -Define a function which can generate a list where the values are square of numbers between 1 and 20 (both included). Then the function needs to print the first 5 elements in the list. - -Hints: - -Use ** operator to get power of a number. -Use range() for loops. -Use list.append() to add values into a list. -Use [n1:n2] to slice a list - -Solution -def printList(): - li=list() - for i in range(1,21): - li.append(i**2) - print li[:5] - - -printList() - - -#----------------------------------------# -2.10 - -Question: -Define a function which can generate a list where the values are square of numbers between 1 and 20 (both included). Then the function needs to print the last 5 elements in the list. 
- -Hints: - -Use ** operator to get power of a number. -Use range() for loops. -Use list.append() to add values into a list. -Use [n1:n2] to slice a list - -Solution -def printList(): - li=list() - for i in range(1,21): - li.append(i**2) - print li[-5:] - - -printList() - - -#----------------------------------------# -2.10 - -Question: -Define a function which can generate a list where the values are square of numbers between 1 and 20 (both included). Then the function needs to print all values except the first 5 elements in the list. - -Hints: - -Use ** operator to get power of a number. -Use range() for loops. -Use list.append() to add values into a list. -Use [n1:n2] to slice a list - -Solution -def printList(): - li=list() - for i in range(1,21): - li.append(i**2) - print li[5:] - - -printList() - - -#----------------------------------------# -2.10 - -Question: -Define a function which can generate and print a tuple where the value are square of numbers between 1 and 20 (both included). - -Hints: - -Use ** operator to get power of a number. -Use range() for loops. -Use list.append() to add values into a list. -Use tuple() to get a tuple from a list. - -Solution -def printTuple(): - li=list() - for i in range(1,21): - li.append(i**2) - print tuple(li) - -printTuple() - - - -#----------------------------------------# -2.10 - -Question: -With a given tuple (1,2,3,4,5,6,7,8,9,10), write a program to print the first half values in one line and the last half values in one line. - -Hints: - -Use [n1:n2] notation to get a slice from a tuple. - -Solution -tp=(1,2,3,4,5,6,7,8,9,10) -tp1=tp[:5] -tp2=tp[5:] -print tp1 -print tp2 - - -#----------------------------------------# -2.10 - -Question: -Write a program to generate and print another tuple whose values are even numbers in the given tuple (1,2,3,4,5,6,7,8,9,10). - -Hints: - -Use "for" to iterate the tuple -Use tuple() to generate a tuple from a list. 
- -Solution -tp=(1,2,3,4,5,6,7,8,9,10) -li=list() -for i in tp: - if tp[i]%2==0: - li.append(tp[i]) - -tp2=tuple(li) -print tp2 - - - -#----------------------------------------# -2.14 - -Question: -Write a program which accepts a string as input to print "Yes" if the string is "yes" or "YES" or "Yes", otherwise print "No". - -Hints: - -Use if statement to judge condition. - -Solution -s= raw_input() -if s=="yes" or s=="YES" or s=="Yes": - print "Yes" -else: - print "No" - - - -#----------------------------------------# -3.4 - -Question: -Write a program which can filter even numbers in a list by using filter function. The list is: [1,2,3,4,5,6,7,8,9,10]. - -Hints: - -Use filter() to filter some elements in a list. -Use lambda to define anonymous functions. - -Solution -li = [1,2,3,4,5,6,7,8,9,10] -evenNumbers = filter(lambda x: x%2==0, li) -print evenNumbers - - -#----------------------------------------# -3.4 - -Question: -Write a program which can map() to make a list whose elements are square of elements in [1,2,3,4,5,6,7,8,9,10]. - -Hints: - -Use map() to generate a list. -Use lambda to define anonymous functions. - -Solution -li = [1,2,3,4,5,6,7,8,9,10] -squaredNumbers = map(lambda x: x**2, li) -print squaredNumbers - -#----------------------------------------# -3.5 - -Question: -Write a program which can map() and filter() to make a list whose elements are square of even number in [1,2,3,4,5,6,7,8,9,10]. - -Hints: - -Use map() to generate a list. -Use filter() to filter elements of a list. -Use lambda to define anonymous functions. - -Solution -li = [1,2,3,4,5,6,7,8,9,10] -evenNumbers = map(lambda x: x**2, filter(lambda x: x%2==0, li)) -print evenNumbers - - - - -#----------------------------------------# -3.5 - -Question: -Write a program which can filter() to make a list whose elements are even number between 1 and 20 (both included). - -Hints: - -Use filter() to filter elements of a list. -Use lambda to define anonymous functions. 
- -Solution -evenNumbers = filter(lambda x: x%2==0, range(1,21)) -print evenNumbers - - -#----------------------------------------# -3.5 - -Question: -Write a program which can map() to make a list whose elements are square of numbers between 1 and 20 (both included). - -Hints: - -Use map() to generate a list. -Use lambda to define anonymous functions. - -Solution -squaredNumbers = map(lambda x: x**2, range(1,21)) -print squaredNumbers - - - - -#----------------------------------------# -7.2 - -Question: -Define a class named American which has a static method called printNationality. - -Hints: - -Use @staticmethod decorator to define class static method. - -Solution -class American(object): - @staticmethod - def printNationality(): - print "America" - -anAmerican = American() -anAmerican.printNationality() -American.printNationality() - - - - -#----------------------------------------# - -7.2 - -Question: -Define a class named American and its subclass NewYorker. - -Hints: - -Use class Subclass(ParentClass) to define a subclass. - -Solution: - -class American(object): - pass - -class NewYorker(American): - pass - -anAmerican = American() -aNewYorker = NewYorker() -print anAmerican -print aNewYorker - - - - -#----------------------------------------# - - -7.2 - -Question: -Define a class named Circle which can be constructed by a radius. The Circle class has a method which can compute the area. - -Hints: - -Use def methodName(self) to define a method. - -Solution: - -class Circle(object): - def __init__(self, r): - self.radius = r - - def area(self): - return self.radius**2*3.14 - -aCircle = Circle(2) -print aCircle.area() - - - - - - -#----------------------------------------# - -7.2 - -Define a class named Rectangle which can be constructed by a length and width. The Rectangle class has a method which can compute the area. - -Hints: - -Use def methodName(self) to define a method. 
- -Solution: - -class Rectangle(object): - def __init__(self, l, w): - self.length = l - self.width = w - - def area(self): - return self.length*self.width - -aRectangle = Rectangle(2,10) -print aRectangle.area() - - - - -#----------------------------------------# - -7.2 - -Define a class named Shape and its subclass Square. The Square class has an init function which takes a length as argument. Both classes have a area function which can print the area of the shape where Shape's area is 0 by default. - -Hints: - -To override a method in super class, we can define a method with the same name in the super class. - -Solution: - -class Shape(object): - def __init__(self): - pass - - def area(self): - return 0 - -class Square(Shape): - def __init__(self, l): - Shape.__init__(self) - self.length = l - - def area(self): - return self.length*self.length - -aSquare= Square(3) -print aSquare.area() - - - - - - - - -#----------------------------------------# - - -Please raise a RuntimeError exception. - -Hints: - -Use raise() to raise an exception. - -Solution: - -raise RuntimeError('something wrong') - - - -#----------------------------------------# -Write a function to compute 5/0 and use try/except to catch the exceptions. - -Hints: - -Use try/except to catch exceptions. - -Solution: - -def throws(): - return 5/0 - -try: - throws() -except ZeroDivisionError: - print "division by zero!" -except Exception, err: - print 'Caught an exception' -finally: - print 'In finally block for cleanup' - - -#----------------------------------------# -Define a custom exception class which takes a string message as attribute. - -Hints: - -To define a custom exception, we need to define a class inherited from Exception. 
- -Solution: - -class MyError(Exception): - """My own exception class - - Attributes: - msg -- explanation of the error - """ - - def __init__(self, msg): - self.msg = msg - -error = MyError("something wrong") - -#----------------------------------------# -Question: - -Assuming that we have some email addresses in the "username@companyname.com" format, please write program to print the user name of a given email address. Both user names and company names are composed of letters only. - -Example: -If the following email address is given as input to the program: - -john@google.com - -Then, the output of the program should be: - -john - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Hints: - -Use \w to match letters. - -Solution: -import re -emailAddress = raw_input() -pat2 = "(\w+)@((\w+\.)+(com))" -r2 = re.match(pat2,emailAddress) -print r2.group(1) - - -#----------------------------------------# -Question: - -Assuming that we have some email addresses in the "username@companyname.com" format, please write program to print the company name of a given email address. Both user names and company names are composed of letters only. - -Example: -If the following email address is given as input to the program: - -john@google.com - -Then, the output of the program should be: - -google - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Hints: - -Use \w to match letters. - -Solution: -import re -emailAddress = raw_input() -pat2 = "(\w+)@(\w+)\.(com)" -r2 = re.match(pat2,emailAddress) -print r2.group(2) - - - - -#----------------------------------------# -Question: - -Write a program which accepts a sequence of words separated by whitespace as input to print the words composed of digits only. - -Example: -If the following words is given as input to the program: - -2 cats and 3 dogs. 
- -Then, the output of the program should be: - -['2', '3'] - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Hints: - -Use re.findall() to find all substring using regex. - -Solution: -import re -s = raw_input() -print re.findall("\d+",s) - - -#----------------------------------------# -Question: - - -Print a unicode string "hello world". - -Hints: - -Use u'strings' format to define unicode string. - -Solution: - -unicodeString = u"hello world!" -print unicodeString - -#----------------------------------------# -Write a program to read an ASCII string and to convert it to a unicode string encoded by utf-8. - -Hints: - -Use unicode() function to convert. - -Solution: - -s = raw_input() -u = unicode( s ,"utf-8") -print u - -#----------------------------------------# -Question: - -Write a special comment to indicate a Python source code file is in unicode. - -Hints: - -Solution: - -# -*- coding: utf-8 -*- - -#----------------------------------------# -Question: - -Write a program to compute 1/2+2/3+3/4+...+n/n+1 with a given n input by console (n>0). - -Example: -If the following n is given as input to the program: - -5 - -Then, the output of the program should be: - -3.55 - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Hints: -Use float() to convert an integer to a float - -Solution: - -n=int(raw_input()) -sum=0.0 -for i in range(1,n+1): - sum += float(float(i)/(i+1)) -print sum - - -#----------------------------------------# -Question: - -Write a program to compute: - -f(n)=f(n-1)+100 when n>0 -and f(0)=1 - -with a given n input by console (n>0). - -Example: -If the following n is given as input to the program: - -5 - -Then, the output of the program should be: - -500 - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Hints: -We can define recursive function in Python. 
- -Solution: - -def f(n): - if n==0: - return 0 - else: - return f(n-1)+100 - -n=int(raw_input()) -print f(n) - -#----------------------------------------# - -Question: - - -The Fibonacci Sequence is computed based on the following formula: - - -f(n)=0 if n=0 -f(n)=1 if n=1 -f(n)=f(n-1)+f(n-2) if n>1 - -Please write a program to compute the value of f(n) with a given n input by console. - -Example: -If the following n is given as input to the program: - -7 - -Then, the output of the program should be: - -13 - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Hints: -We can define recursive function in Python. - - -Solution: - -def f(n): - if n == 0: return 0 - elif n == 1: return 1 - else: return f(n-1)+f(n-2) - -n=int(raw_input()) -print f(n) - - -#----------------------------------------# - -#----------------------------------------# - -Question: - -The Fibonacci Sequence is computed based on the following formula: - - -f(n)=0 if n=0 -f(n)=1 if n=1 -f(n)=f(n-1)+f(n-2) if n>1 - -Please write a program using list comprehension to print the Fibonacci Sequence in comma separated form with a given n input by console. - -Example: -If the following n is given as input to the program: - -7 - -Then, the output of the program should be: - -0,1,1,2,3,5,8,13 - - -Hints: -We can define recursive function in Python. -Use list comprehension to generate a list from an existing list. -Use string.join() to join a list of strings. - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: - -def f(n): - if n == 0: return 0 - elif n == 1: return 1 - else: return f(n-1)+f(n-2) - -n=int(raw_input()) -values = [str(f(x)) for x in range(0, n+1)] -print ",".join(values) - - -#----------------------------------------# - -Question: - -Please write a program using generator to print the even numbers between 0 and n in comma separated form while n is input by console. 
- -Example: -If the following n is given as input to the program: - -10 - -Then, the output of the program should be: - -0,2,4,6,8,10 - -Hints: -Use yield to produce the next value in generator. - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: - -def EvenGenerator(n): - i=0 - while i<=n: - if i%2==0: - yield i - i+=1 - - -n=int(raw_input()) -values = [] -for i in EvenGenerator(n): - values.append(str(i)) - -print ",".join(values) - - -#----------------------------------------# - -Question: - -Please write a program using generator to print the numbers which can be divisible by 5 and 7 between 0 and n in comma separated form while n is input by console. - -Example: -If the following n is given as input to the program: - -100 - -Then, the output of the program should be: - -0,35,70 - -Hints: -Use yield to produce the next value in generator. - -In case of input data being supplied to the question, it should be assumed to be a console input. - -Solution: - -def NumGenerator(n): - for i in range(n+1): - if i%5==0 and i%7==0: - yield i - -n=int(raw_input()) -values = [] -for i in NumGenerator(n): - values.append(str(i)) - -print ",".join(values) - - -#----------------------------------------# - -Question: - - -Please write assert statements to verify that every number in the list [2,4,6,8] is even. - - - -Hints: -Use "assert expression" to make assertion. - - -Solution: - -li = [2,4,6,8] -for i in li: - assert i%2==0 - - -#----------------------------------------# -Question: - -Please write a program which accepts basic mathematic expression from console and print the evaluation result. - -Example: -If the following string is given as input to the program: - -35+3 - -Then, the output of the program should be: - -38 - -Hints: -Use eval() to evaluate an expression. 
- - -Solution: - -expression = raw_input() -print eval(expression) - - -#----------------------------------------# -Question: - -Please write a binary search function which searches an item in a sorted list. The function should return the index of element to be searched in the list. - - -Hints: -Use if/elif to deal with conditions. - - -Solution: - -import math -def bin_search(li, element): - bottom = 0 - top = len(li)-1 - index = -1 - while top>=bottom and index==-1: - mid = int(math.floor((top+bottom)/2.0)) - if li[mid]==element: - index = mid - elif li[mid]>element: - top = mid-1 - else: - bottom = mid+1 - - return index - -li=[2,5,7,9,11,17,222] -print bin_search(li,11) -print bin_search(li,12) - - - - -#----------------------------------------# -Question: - -Please write a binary search function which searches an item in a sorted list. The function should return the index of element to be searched in the list. - - -Hints: -Use if/elif to deal with conditions. - - -Solution: - -import math -def bin_search(li, element): - bottom = 0 - top = len(li)-1 - index = -1 - while top>=bottom and index==-1: - mid = int(math.floor((top+bottom)/2.0)) - if li[mid]==element: - index = mid - elif li[mid]>element: - top = mid-1 - else: - bottom = mid+1 - - return index - -li=[2,5,7,9,11,17,222] -print bin_search(li,11) -print bin_search(li,12) - - - - -#----------------------------------------# -Question: - -Please generate a random float where the value is between 10 and 100 using Python math module. - - - -Hints: -Use random.random() to generate a random float in [0,1]. - - -Solution: - -import random -print random.random()*100 - -#----------------------------------------# -Question: - -Please generate a random float where the value is between 5 and 95 using Python math module. - - - -Hints: -Use random.random() to generate a random float in [0,1]. 
- - -Solution: - -import random -print random.random()*100-5 - - -#----------------------------------------# -Question: - -Please write a program to output a random even number between 0 and 10 inclusive using random module and list comprehension. - - - -Hints: -Use random.choice() to a random element from a list. - - -Solution: - -import random -print random.choice([i for i in range(11) if i%2==0]) - - -#----------------------------------------# -Question: - -Please write a program to output a random number, which is divisible by 5 and 7, between 0 and 10 inclusive using random module and list comprehension. - - - -Hints: -Use random.choice() to a random element from a list. - - -Solution: - -import random -print random.choice([i for i in range(201) if i%5==0 and i%7==0]) - - - -#----------------------------------------# - -Question: - -Please write a program to generate a list with 5 random numbers between 100 and 200 inclusive. - - - -Hints: -Use random.sample() to generate a list of random values. - - -Solution: - -import random -print random.sample(range(100), 5) - -#----------------------------------------# -Question: - -Please write a program to randomly generate a list with 5 even numbers between 100 and 200 inclusive. - - - -Hints: -Use random.sample() to generate a list of random values. - - -Solution: - -import random -print random.sample([i for i in range(100,201) if i%2==0], 5) - - -#----------------------------------------# -Question: - -Please write a program to randomly generate a list with 5 numbers, which are divisible by 5 and 7 , between 1 and 1000 inclusive. - - - -Hints: -Use random.sample() to generate a list of random values. - - -Solution: - -import random -print random.sample([i for i in range(1,1001) if i%5==0 and i%7==0], 5) - -#----------------------------------------# - -Question: - -Please write a program to randomly print a integer number between 7 and 15 inclusive. 
- - - -Hints: -Use random.randrange() to a random integer in a given range. - - -Solution: - -import random -print random.randrange(7,16) - -#----------------------------------------# - -Question: - -Please write a program to compress and decompress the string "hello world!hello world!hello world!hello world!". - - - -Hints: -Use zlib.compress() and zlib.decompress() to compress and decompress a string. - - -Solution: - -import zlib -s = 'hello world!hello world!hello world!hello world!' -t = zlib.compress(s) -print t -print zlib.decompress(t) - -#----------------------------------------# -Question: - -Please write a program to print the running time of execution of "1+1" for 100 times. - - - -Hints: -Use timeit() function to measure the running time. - -Solution: - -from timeit import Timer -t = Timer("for i in range(100):1+1") -print t.timeit() - -#----------------------------------------# -Question: - -Please write a program to shuffle and print the list [3,6,7,8]. - - - -Hints: -Use shuffle() function to shuffle a list. - -Solution: - -from random import shuffle -li = [3,6,7,8] -shuffle(li) -print li - -#----------------------------------------# -Question: - -Please write a program to shuffle and print the list [3,6,7,8]. - - - -Hints: -Use shuffle() function to shuffle a list. - -Solution: - -from random import shuffle -li = [3,6,7,8] -shuffle(li) -print li - - - -#----------------------------------------# -Question: - -Please write a program to generate all sentences where subject is in ["I", "You"] and verb is in ["Play", "Love"] and the object is in ["Hockey","Football"]. - -Hints: -Use list[index] notation to get a element from a list. - -Solution: - -subjects=["I", "You"] -verbs=["Play", "Love"] -objects=["Hockey","Football"] -for i in range(len(subjects)): - for j in range(len(verbs)): - for k in range(len(objects)): - sentence = "%s %s %s." 
% (subjects[i], verbs[j], objects[k]) - print sentence - - -#----------------------------------------# -Please write a program to print the list after removing delete even numbers in [5,6,77,45,22,12,24]. - -Hints: -Use list comprehension to delete a bunch of element from a list. - -Solution: - -li = [5,6,77,45,22,12,24] -li = [x for x in li if x%2!=0] -print li - -#----------------------------------------# -Question: - -By using list comprehension, please write a program to print the list after removing delete numbers which are divisible by 5 and 7 in [12,24,35,70,88,120,155]. - -Hints: -Use list comprehension to delete a bunch of element from a list. - -Solution: - -li = [12,24,35,70,88,120,155] -li = [x for x in li if x%5!=0 and x%7!=0] -print li - - -#----------------------------------------# -Question: - -By using list comprehension, please write a program to print the list after removing the 0th, 2nd, 4th,6th numbers in [12,24,35,70,88,120,155]. - -Hints: -Use list comprehension to delete a bunch of element from a list. -Use enumerate() to get (index, value) tuple. - -Solution: - -li = [12,24,35,70,88,120,155] -li = [x for (i,x) in enumerate(li) if i%2!=0] -print li - -#----------------------------------------# - -Question: - -By using list comprehension, please write a program generate a 3*5*8 3D array whose each element is 0. - -Hints: -Use list comprehension to make an array. - -Solution: - -array = [[ [0 for col in range(8)] for col in range(5)] for row in range(3)] -print array - -#----------------------------------------# -Question: - -By using list comprehension, please write a program to print the list after removing the 0th,4th,5th numbers in [12,24,35,70,88,120,155]. - -Hints: -Use list comprehension to delete a bunch of element from a list. -Use enumerate() to get (index, value) tuple. 
- -Solution: - -li = [12,24,35,70,88,120,155] -li = [x for (i,x) in enumerate(li) if i not in (0,4,5)] -print li - - - -#----------------------------------------# - -Question: - -By using list comprehension, please write a program to print the list after removing the value 24 in [12,24,35,24,88,120,155]. - -Hints: -Use list's remove method to delete a value. - -Solution: - -li = [12,24,35,24,88,120,155] -li = [x for x in li if x!=24] -print li - - -#----------------------------------------# -Question: - -With two given lists [1,3,6,78,35,55] and [12,24,35,24,88,120,155], write a program to make a list whose elements are intersection of the above given lists. - -Hints: -Use set() and "&=" to do set intersection operation. - -Solution: - -set1=set([1,3,6,78,35,55]) -set2=set([12,24,35,24,88,120,155]) -set1 &= set2 -li=list(set1) -print li - -#----------------------------------------# - -With a given list [12,24,35,24,88,120,155,88,120,155], write a program to print this list after removing all duplicate values with original order reserved. - -Hints: -Use set() to store a number of values without duplicate. - -Solution: - -def removeDuplicate( li ): - newli=[] - seen = set() - for item in li: - if item not in seen: - seen.add( item ) - newli.append(item) - - return newli - -li=[12,24,35,24,88,120,155,88,120,155] -print removeDuplicate(li) - - -#----------------------------------------# -Question: - -Define a class Person and its two child classes: Male and Female. All classes have a method "getGender" which can print "Male" for Male class and "Female" for Female class. - -Hints: -Use Subclass(Parentclass) to define a child class. 
- -Solution: - -class Person(object): - def getGender( self ): - return "Unknown" - -class Male( Person ): - def getGender( self ): - return "Male" - -class Female( Person ): - def getGender( self ): - return "Female" - -aMale = Male() -aFemale= Female() -print aMale.getGender() -print aFemale.getGender() - - - -#----------------------------------------# -Question: - -Please write a program which count and print the numbers of each character in a string input by console. - -Example: -If the following string is given as input to the program: - -abcdefgabc - -Then, the output of the program should be: - -a,2 -c,2 -b,2 -e,1 -d,1 -g,1 -f,1 - -Hints: -Use dict to store key/value pairs. -Use dict.get() method to lookup a key with default value. - -Solution: - -dic = {} -s=raw_input() -for s in s: - dic[s] = dic.get(s,0)+1 -print '\n'.join(['%s,%s' % (k, v) for k, v in dic.items()]) - -#----------------------------------------# - -Question: - -Please write a program which accepts a string from console and print it in reverse order. - -Example: -If the following string is given as input to the program: - -rise to vote sir - -Then, the output of the program should be: - -ris etov ot esir - -Hints: -Use list[::-1] to iterate a list in a reverse order. - -Solution: - -s=raw_input() -s = s[::-1] -print s - -#----------------------------------------# - -Question: - -Please write a program which accepts a string from console and print the characters that have even indexes. - -Example: -If the following string is given as input to the program: - -H1e2l3l4o5w6o7r8l9d - -Then, the output of the program should be: - -Helloworld - -Hints: -Use list[::2] to iterate a list by step 2. - -Solution: - -s=raw_input() -s = s[::2] -print s -#----------------------------------------# - - -Question: - -Please write a program which prints all permutations of [1,2,3] - - -Hints: -Use itertools.permutations() to get permutations of list. 
- -Solution: - -import itertools -print list(itertools.permutations([1,2,3])) - -#----------------------------------------# -Question: - -Write a program to solve a classic ancient Chinese puzzle: -We count 35 heads and 94 legs among the chickens and rabbits in a farm. How many rabbits and how many chickens do we have? - -Hint: -Use for loop to iterate all possible solutions. - -Solution: - -def solve(numheads,numlegs): - ns='No solutions!' - for i in range(numheads+1): - j=numheads-i - if 2*i+4*j==numlegs: - return i,j - return ns,ns - -numheads=35 -numlegs=94 -solutions=solve(numheads,numlegs) -print solutions - -#----------------------------------------# - - diff --git a/curlpy.py b/curlpy.py new file mode 100644 index 00000000..5a532f34 --- /dev/null +++ b/curlpy.py @@ -0,0 +1,21 @@ +import urllib, urllib2, cookielib + +#cookie storage +cj = cookielib.CookieJar() +#create an opener +opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) +#Add useragent, sites don't like to interact programs. 
+opener.addheaders.append(('User-agent', 'Mozilla/4.0')) +opener.addheaders.append( ('Referer', 'https://www.hscripts.com/tools/mailid-validation/') ) + +login_data = urllib.urlencode({'email' : 'office@mos.org', 'start' : 'go' + }) +#hhhhhhhhh +resp = opener.open('https://www.hscripts.com/tools/mailid-validation/', login_data) +the_page = resp.read() + +file = open('outpu.html','w') +file.write(the_page) +file.close() +resp.close() + diff --git a/d/urbanspoon/scrapy.cfg b/d/urbanspoon/scrapy.cfg new file mode 100644 index 00000000..e0b71b54 --- /dev/null +++ b/d/urbanspoon/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# http://doc.scrapy.org/topics/scrapyd.html + +[settings] +default = urbanspoon.settings + +[deploy] +#url = http://localhost:6800/ +project = urbanspoon diff --git a/README b/d/urbanspoon/urbanspoon/__init__.py similarity index 100% rename from README rename to d/urbanspoon/urbanspoon/__init__.py diff --git a/d/urbanspoon/urbanspoon/items.py b/d/urbanspoon/urbanspoon/items.py new file mode 100644 index 00000000..a871573b --- /dev/null +++ b/d/urbanspoon/urbanspoon/items.py @@ -0,0 +1,22 @@ +# Define here the models for your scraped items +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/items.html + +from scrapy.item import Item, Field + +class UrbanspoonItem(Item): + # define the fields for your item here like: + # name = Field() + b_id = Field() + url = Field() + n_votes = Field() + p_like= Field() + n_reviews= Field() + date= Field() + title= Field() + description= Field() + user = Field() + + + diff --git a/d/urbanspoon/urbanspoon/pipelines.py b/d/urbanspoon/urbanspoon/pipelines.py new file mode 100644 index 00000000..41721f0c --- /dev/null +++ b/d/urbanspoon/urbanspoon/pipelines.py @@ -0,0 +1,8 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: 
http://doc.scrapy.org/topics/item-pipeline.html + +class UrbanspoonPipeline(object): + def process_item(self, item, spider): + return item diff --git a/d/urbanspoon/urbanspoon/settings.py b/d/urbanspoon/urbanspoon/settings.py new file mode 100644 index 00000000..58b0a53e --- /dev/null +++ b/d/urbanspoon/urbanspoon/settings.py @@ -0,0 +1,15 @@ +# Scrapy settings for Urbanspoon project +# +# For simplicity, this file contains only the most important settings by +# default. All the other settings are documented here: +# +# http://doc.scrapy.org/topics/settings.html +# + +BOT_NAME = 'urbanspoon' + +SPIDER_MODULES = ['urbanspoon.spiders'] +NEWSPIDER_MODULE = 'urbanspoon.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'urbanspoon (+http://www.yourdomain.com)' diff --git a/d/urbanspoon/urbanspoon/spiders/__init__.py b/d/urbanspoon/urbanspoon/spiders/__init__.py new file mode 100644 index 00000000..ebd689ac --- /dev/null +++ b/d/urbanspoon/urbanspoon/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/d/urbanspoon/urbanspoon/spiders/urban.py b/d/urbanspoon/urbanspoon/spiders/urban.py new file mode 100644 index 00000000..efdfbe4e --- /dev/null +++ b/d/urbanspoon/urbanspoon/spiders/urban.py @@ -0,0 +1,108 @@ +""" Enter Url here""" +#url =['http://www.urbanspoon.com/r/35/1571097/restaurant/Hampton-Roads/Dam-Neck-Corner-Pungo/Firebrew-Virginia-Beach'] +url =['http://www.urbanspoon.com/r/1/4524/restaurant/Capitol-Hill/Honey-Hole-Sandwiches-Seattle','http://www.urbanspoon.com/r/35/1571097/restaurant/Hampton-Roads/Dam-Neck-Corner-Pungo/Firebrew-Virginia-Beach','http://www.urbanspoon.com/r/13/169913/restaurant/North-Richland-Hills-Richland-Hills/Texs-Star-Grill-Watauga'] +from scrapy.spider import BaseSpider +from urlparse import urljoin +from scrapy.http import Request +from scrapy.selector import HtmlXPathSelector +from urbanspoon.items import UrbanspoonItem +import psycopg2 + + +""" Database Creation""" +try: + con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") + cur = con.cursor() +except: + """ Database not Connected """ + +class UrbanspoonSpider(BaseSpider): + + name = "urban" + +# allowed_domains = ["urbanspoon.com"] + start_urls=[url[i] for i in range(len(url))] + +# filehandle = open('Urbanspoon.csv','w') +# filehandle.write("Manufacturer\tDescription\tPart Number\tUrl\tAvailability\tPrice\tItem Number\tOEM\tMore_items\n") + + def parse(self,response): + + item=UrbanspoonItem() + hxs = HtmlXPathSelector(response) + item['url']= response.url + item['b_id'] = 0 + + """Getting the Business Id from the DB if exists """ + + qu = ("select distinct url from social_data.urbanspoon") + cur.execute(qu) + rows = cur.fetchall() + li = [r[0] for r in rows] +# print li + + for i in range(len(li)): + # print li[i], item['url'] + if li[i] in item['url']: + qu1 = ("select distinct business_id from social_data.urbanspoon where url='%s'")%li[i] + cur.execute(qu1) + rows1= cur.fetchall() + li1=[i[0] for i in rows1] +# 
print li1 + if len(li1) > 0: + item['b_id'] = li1[0] + if item['b_id'] > 0: + print " ID assigned" + else: + item['b_id'] = input("Enter the Business Id here for URL: %s : "%item['url']) + + + print item['b_id'] + + + + item['n_votes'] = hxs.select('//div[@class="stats"]/div/text()').extract()[0].encode('utf-8').replace('\n','').strip() + item['p_like'] = hxs.select('//div[@class="rating"]/text()').extract()[0].encode('utf-8').strip() + item['n_reviews'] = int(hxs.select('//div[@class="stats"]/div/a[@data-ga-action="reviews"]/text()').extract()[0].split(' ')[0].encode('utf-8').strip()) + nxt_link = 'http://www.urbanspoon.com'+hxs.select('//div[@data-ga-action="diner-reviews"]/@data-url').extract()[0].encode('utf-8').strip() + print nxt_link + if nxt_link: + yield Request(nxt_link, callback=self.parse_sub, meta=dict(item=item)) + + def parse_sub(self,response): + + + print "Sub Parse Called" + item = response.meta.get('item') + hxs = HtmlXPathSelector(response) + + x=hxs.select('//ul/li[@class="comment review"]') + length=len(x[0].select('//div[@class="details"]/div[@class="byline"]/a[@itemprop="reviewer"]/text()').extract()) + + for i in range(length): + + try: + item['date'] = x[0].select('//div[@class="details"]/div/time[@class="posted-on"]/text()').extract()[i].encode('utf-8').split(' ')[2].replace('\n','') + + except: + item['date'] = '' + try: + item['title'] = x[0].select('//div[@class="details"]/div[@class="title"]/text()').extract()[i].encode('utf-8').replace("'","").strip() + except: + item['title'] = '' + + try: + item['description'] = x[0].select('//div[@class="details"]/div[@itemprop="description"]').extract()[i].encode('utf-8').split('\n')[1].replace("'","") + except: + item['description'] = '' + try: + item['user'] = x[0].select('//div[@class="details"]/div[@class="byline"]/a[@itemprop="reviewer"]/text()').extract()[i].encode('utf-8').replace("'","").strip() + except: + item['user'] = '' + print item['user'] + sql = ("insert into 
social_data.urbanspoon select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.urbanspoon where review_description='%s' and review_user='%s')"%(item['b_id'],item['url'],item['n_votes'], item['p_like'], item['n_reviews'],item['date'],item['title'],item['description'], item['user'],item['description'], item['user'])) + cur.execute(sql) + con.commit() + + + diff --git a/d/yelp/scrapy.cfg b/d/yelp/scrapy.cfg new file mode 100644 index 00000000..74ee695b --- /dev/null +++ b/d/yelp/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# http://doc.scrapy.org/topics/scrapyd.html + +[settings] +default = yelp.settings + +[deploy] +#url = http://localhost:6800/ +project = yelp diff --git a/d/yelp/yelp/__init__.py b/d/yelp/yelp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/d/yelp/yelp/pipelines.py b/d/yelp/yelp/pipelines.py new file mode 100644 index 00000000..86f7a6ca --- /dev/null +++ b/d/yelp/yelp/pipelines.py @@ -0,0 +1,8 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/topics/item-pipeline.html + +class YelpPipeline(object): + def process_item(self, item, spider): + return item diff --git a/d/yelp/yelp/settings.py b/d/yelp/yelp/settings.py new file mode 100644 index 00000000..b964190a --- /dev/null +++ b/d/yelp/yelp/settings.py @@ -0,0 +1,15 @@ +# Scrapy settings for yelp project +# +# For simplicity, this file contains only the most important settings by +# default. 
All the other settings are documented here: +# +# http://doc.scrapy.org/topics/settings.html +# + +BOT_NAME = 'yelp' + +SPIDER_MODULES = ['yelp.spiders'] +NEWSPIDER_MODULE = 'yelp.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'yelp (+http://www.yourdomain.com)' diff --git a/d/yelp/yelp/spiders/__init__.py b/d/yelp/yelp/spiders/__init__.py new file mode 100644 index 00000000..ebd689ac --- /dev/null +++ b/d/yelp/yelp/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/d/yelp/yelp/spiders/yelp_spider.py b/d/yelp/yelp/spiders/yelp_spider.py new file mode 100644 index 00000000..066962a7 --- /dev/null +++ b/d/yelp/yelp/spiders/yelp_spider.py @@ -0,0 +1,138 @@ +" Enter Url here" + +url =['http://www.yelp.com/biz/firebrew-virginia-beach','http://www.yelp.com/biz/taboon-grand-blanc-grand-blanc-township','http://www.yelp.com/biz/lincoln-washington','http://www.yelp.com/biz/zingermans-delicatessen-ann-arbor-2'] +#url =['http://www.yelp.com/biz/firebrew-virginia-beach'] + +from scrapy.spider import BaseSpider +from scrapy.http import Request +from urlparse import urljoin +from scrapy.selector import HtmlXPathSelector +from yelp.items import YelpItem +import psycopg2 + + + +""" Database Creation""" +try: + con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") + cur = con.cursor() +except: + """ Database not Connected """ + +class YelpSpider(BaseSpider): + + + name = "ylpp" + start_urls=[url[i] for i in range(len(url))] +# f=open('opt.csv','w') + count = 0 + def parse(self, response): + + itm=[] + hxs = HtmlXPathSelector(response) + item = YelpItem() + item['bid'] = 0 + item['url'] = response.url + + """Getting the Business Id from the DB if exists """ + + qu = ("select distinct url from social_data.yelp") + 
cur.execute(qu) + rows = cur.fetchall() + li = [r[0] for r in rows] +# print li + for i in range(len(li)): + # print li[i], item['url'] + if li[i] in item['url']: + qu1 = ("select distinct bid from social_data.yelp where url='%s'")%li[i] + cur.execute(qu1) + rows1= cur.fetchall() + li1=[i[0] for i in rows1] + # print li1 + if len(li1) > 0: + item['bid'] = li1[0] + if item['bid'] > 0: + print " ID assigned" + else: + item['bid'] = input("Enter the Business Id here for URL: %s : "%item['url']) + + print item['bid'] + + + + + item['rating'] = float(hxs.select('//div[@itemprop="aggregateRating"]/div/meta/@content').extract()[0].encode('ascii', 'ignore').strip()) + item['rv_count'] = int(hxs.select('//span[@itemprop="reviewCount"]/text()').extract()[0].encode('ascii', 'ignore').strip()) + + if item['rv_count'] > 0: + + no = len(hxs.select('//div[@itemprop="review"]').extract()) + x=hxs.select('//div[@class="review-list"]/ul/li') + for i in range(no): + + self.__class__.count = self.__class__.count + 1 + try: + item['rv_date'] = x[0].select('//meta[@itemprop="datePublished"]/@content').extract()[i].encode('ascii', 'ignore').strip() + except: + item['rv_date'] = 'NULL' + try: + item['rv_profile'] = x[0].select('//li[@class="user-name"]/a/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'","").strip() + except: + item['rv_profile'] = 'NULL' + try: + item['rv_rating'] = float(x[0].select('//div[@itemprop="reviewRating"]/div/meta/@content').extract()[i].encode('ascii', 'ignore').strip()) + except: + item['rv_rating'] = 'NULL' + try: + item['rv_dc'] = x[0].select('//div[@class="review-content"]/p').extract()[i].encode('ascii', 'ignore').split('lang="en">')[1].replace("
",'').replace('

','').replace("'"," ").replace('.','').strip() + except: + item['rv_dc'] = 'NULL' + + + # sql = ("insert into social_data.yelp select '%s','%s','%s,'%s','%s','%s','%s','%s' where not exists ( select * from social_data.yelp where rv_desc='%s' and rv_user='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_rating'],item['rv_dc'],item['rv_profile'],item['rv_dc'],item['rv_profile'])) + + sql = ("insert into social_data.yelp select '%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.yelp where rv_desc='%s' and rv_user='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_rating'],item['rv_dc'],item['rv_profile'],item['rv_dc'],item['rv_profile'])) + + cur.execute(sql) + con.commit() + print item['url'],item['rv_profile'] + print self.__class__.count +# if item['rv_count'] > self.__class__.count: + + """Check the review which has more than 40""" + if item['rv_count'] > 40: +# nxt_link = hxs.select('//a[@class="page-option prev-next"]/@href').extract()[0].encode('ascii', 'ignore').split('?')[0]+'?start=%s'%self.__class__.count + nxt_link = [] + + """ Parse the nex review link by multiple of 40""" + + + if float(item['rv_count'])/40 > item['rv_count']/40: + x= range(40,(item['rv_count']/40)*40+1,40) + for i in range(len(x)): + nxt_link.append(hxs.select('//a[@class="page-option prev-next"]/@href').extract()[0].encode('ascii', 'ignore').split('?')[0]+'?start=%s'%x[i]) + else: + x= range(40,(item['rv_count']/40-1)*40+1,40) + for i in range(len(x)): + nxt_link.append(hxs.select('//a[@class="page-option prev-next"]/@href').extract()[0].encode('ascii', 'ignore').split('?')[0]+'?start=%s'%x[i]) + print nxt_link + for i in range(len(nxt_link)): + if nxt_link[i]: + """ Next link Processed """ + yield Request( nxt_link[i], callback=self.parse) + + else: + + self.__class__.count = 0 + + """ Reviews below 40 for this business""" + + else: + item['rv_date'] = 'NULL' + 
item['rv_profile'] = 'NULL' + item['rv_rating'] = 'NULL' + item['rv_dc'] = 'NULL' + + + + diff --git a/s b/s new file mode 100644 index 00000000..666ecfb9 --- /dev/null +++ b/s @@ -0,0 +1,395 @@ + + +

Generate a list: + +

+

Things you can do: + $thing"; + } + + unset($thing); + ?> + + + + + // echo "Oh, the humanity!"; + 5){ + echo "You get a 10% discount!"; + } + else + { + echo "dsdd";}; + ?> + + + + + + + + + + + $lang

"; + ?> + + // Echoes the first five even numbers + for ($i = 2; $i < 11; $i = $i + 2) { + echo $i; + + + + + H"; + } + else { + $headCount = 0; + echo "
T
"; + } + } + echo "

It took {$flipCount} flips!

"; + + + + $c=0; + while($c<4): + + echo "

Iteration number: {$c}

"; + $c++; + endwhile; + + + + ?> + + + + + H"; + } + else { + echo "
T
"; + } + } while ($flip); + $verb = "were"; + $last = "flips"; + if ($flipCount == 1) { + $verb = "was"; + $last = "flip"; + } + echo "

There {$verb} {$flipCount} {$last}!

"; + ?> + + $length = strlen("david"); + print $length; + + + +$d="anandhakumar"; +echo substr($d,0,5); + +echo strtoupper($d) + echo strtolower($d) + + $n="anandha"; + echo strpos($n,"a"); + + if (strpos($n,"g")==false) + { + echo "not there"; + } + print round(M_PI, 3); + $n="anandhakumar"; + + + + echo $n[rand(0,strlen($n))]; + + $a=array(); + array_push($a,"anand"); + + + $the_array=array(1,4,545,6,4,6,7,3); + sort($the_array); + print join(",", $the_array); + rsort($the_array); + print join(",",$the_array) + + + + + + + + + + firstname = $firstname; + $this->lastname = $lastname; + $this->age = $age; + } + + // Creating a method (function tied to an object) + + } + + // Creating a new person called "boring 12345", who is 12345 years old ;-) + $teacher = new Person('boring', '12345', 12345); + $student = new Person('aabording', 'a12345', 212345); + + echo $student->age; + + // Printing out, what the greet method returns + ?> + + + + + + +

+ honk(); + + + + ?> + + + + + +

+ name = $name; + } + + public function dance() { + return "I'm dancing!"; + } + } + + $me = new Person("Shane"); + if (is_a($me, "Person")) { + echo "I'm a person, "; + } + if (property_exists($me, "name")) { + echo "I have a name, "; + } + if (method_exists($me, "dance")) { + echo "and I know how to dance!"; + } + ?> + + +

+ 'with', + 'tomato' => 'without', + 'onions' => 'with'); + + // Looping through an array using "for". + // First, let's get the length of the array! + $length = count($food); + + // Remember, arrays in PHP are zero-based: + for ($i = 0; $i < $length; $i++) { + echo $food[$i] . '
'; + } + + echo '

I want my salad:
'; + + // Loop through an associative array using "foreach": + foreach ($salad as $ingredient=>$include) { + echo $include . ' ' . $ingredient . '
'; + } + + echo '

'; + + // Create your own array here and loop + // through it using foreach! + $me = array('hair' => 'black', + 'skin tone' => 'light'); + + foreach ($me as $k=>$v) + { + echo $v . ' '.$k; + } + + ?> + + + + +sudo -i -u postgres + + createdb mm + + psql +mm postgres template0 template1 +postgres@MMTPC104:~$ psql -d mm + + +select * from mmtweet; + create table mmtweet( id integer, username varchar(30), time timestamp, tweet text, r integer, f integer); +alter user postgres with PASSWORD 'mercuryminds'; + + + +insert into mmtweet values ('1','mm','2015-01-12 13:30:07 +mm'# ', 'DonĂ¢Â€Â™t shy away from #mcommerce . Try it out today itself... http://t.co/ZEQwKy6ydv http://t.co/IMTosu1KT3' +mm(# , 2, 1) \ No newline at end of file diff --git a/trip_spider.py b/trip_spider.py new file mode 100644 index 00000000..954528a7 --- /dev/null +++ b/trip_spider.py @@ -0,0 +1,315 @@ +" Enter Url here" + +url =['http://www.tripadvisor.com/Restaurant_Review-g42251-d4164754-Reviews-Taboon-Grand_Blanc_Michigan.html','http://www.tripadvisor.com/Restaurant_Review-g28970-d2137408-Reviews-Lincoln_DC-Washington_DC_District_of_Columbia.html','http://www.tripadvisor.com/Restaurant_Review-g29556-d416774-Reviews-Zingerman_s_Delicatessen-Ann_Arbor_Michigan.html','http://www.tripadvisor.com/Restaurant_Review-g58277-d3529482-Reviews-FireBrew-Virginia_Beach_Virginia.html'] + + +from scrapy.spider import BaseSpider +from scrapy.http import Request +from urlparse import urljoin +from scrapy.selector import HtmlXPathSelector +from trip.items import TripItem +import psycopg2 +from datetime import datetime + + +""" Database Creation""" +try: + con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") + cur = con.cursor() +except: + """ Database not Connected """ + +class YelpSpider(BaseSpider): + + + name = "tripss" + start_urls=[url[i] for i in range(len(url))] + + def parse(self, response): + + itm=[] + hxs = HtmlXPathSelector(response) + item = TripItem() + 
item['bid'] = 0 + item['url'] = response.url + + """Getting the Business Id from the DB if exists """ + + qu = ("select distinct u from social_data.tripadvisor1") + cur.execute(qu) + rows = cur.fetchall() + li = [r[0] for r in rows] + + + + +# print li + for i in range(len(li)): +# print li[i], item['url'] + try: + lis=li[i].split('-or')[1].split('-')[1] + except: + lis=li[i].split('Reviews-')[1].split('-')[0] + if item['url'].find(lis)>0: + qu1 = ("select distinct id from social_data.tripadvisor1 where u='%s'")%li[i] + cur.execute(qu1) + rows1= cur.fetchall() + li1=[i[0] for i in rows1] + # print li1 + if len(li1) > 0: + item['bid'] = li1[0] + qu2 = ("select max(d) from social_data.tripadvisor1 where id =%d")%item['bid'] + cur.execute(qu2) + rows2=cur.fetchall() + max_date = [i[0] for i in rows2][0] + print qu2,rows2,max_date + + if item['bid'] > 0: + print " ID assigned" + else: + item['bid'] = input("Enter the Business Id here for URL: %s : "%item['url']) + max_date = datetime.strptime('','') + print max_date + + print item['bid'] + + + + + item['rating'] = float(hxs.select('//div[@class="rs rating"]/span/img/@content').extract()[0].encode('ascii', 'ignore').strip()) + item['rv_count'] = int(hxs.select('//div[@class="rs rating"]/a/span/text()').extract()[0].encode('ascii', 'ignore').strip()) + + if item['rv_count'] > 0: + + no = hxs.select('//div[@class="review basic_review inlineReviewUpdate provider0"]').extract() + + for i in range(len(no)): + + + try: + xd=hxs.select('//span[@class="ratingDate"]').extract()[i].encode('utf-8').split('Reviewed')[1].split('\n')[0].replace(',','').replace('\n','').strip() + + item['rv_date'] = str(datetime.strptime(xd,'%B %d %Y')) + + + except: + item['rv_date'] = '0001-01-01 00:00:00' + current_date = datetime.strptime(item['rv_date'],'%Y-%m-%d %X') + + try: + item['rv_profile'] = hxs.select('//div[@class="username mo"]/span/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'"," ").strip() + except: + 
item['rv_profile'] = 'A TripAdvisor reviewer on Facebook' + try: + item['rv_heading'] = hxs.select('//span[@class="noQuotes"]/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'"," ").strip() + except: + item['rv_heading'] = 'NULL' + + try: + item['rv_rating'] = float(hxs.select('//div[@class="rating reviewItemInline"]/span/img/@alt').extract()[i].encode('ascii', 'ignore').split(' ')[0].strip()) + except: + item['rv_rating'] = '0.0' + try: + item['rv_dc'] = hxs.select('//div[@class="entry"]/p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'"," ").replace('.','').strip() + except: + item['rv_dc'] = 'NULL' + last_date = current_date + + + """ It Only insert the new Feeds """ + print current_date, max_date + if current_date >= max_date: + + sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_heading'],item['rv_rating'],item['rv_dc'],item['rv_profile'],item['rv_dc'],item['rv_profile'])) + + cur.execute(sql) + con.commit() + else: + break + print "No updated review are here" + +# print item['url'],item['rv_profile'] + + + """ Parse Next link""" + + + + try: + link = hxs.select('//div[@class="pgLinks"]/a[@class="guiArw sprite-pageNext "]/@href').extract()[0].encode('ascii', 'ignore').strip() + nxt_link = urljoin(response.url,link) + except: + nxt_link = [] + + + + print nxt_link + + + if nxt_link: + """ Next link Processed """ + + if last_date > max_date and '1900-01-01 00:00:00' not in str(max_date): + print "enter 1st" + yield Request( nxt_link, callback=self.parse) + elif '1900-01-01 00:00:00' in str(max_date): + print "enter 2nd" + yield Request( nxt_link, callback=self.parse_sub) + else: + print " Do nothing No other pages available" + else: + print " Progress Completed " + + + + else: + 
item['rv_date'] = 'NULL' + item['rv_profile'] = 'NULL' + item['rv_rating'] = 'NULL' + item['rv_dc'] = 'NULL' + item['rv_heading'] = 'NULL' + + def parse_sub(self, response): + + print "Sub parse Called" + + itm=[] + hxs1 = HtmlXPathSelector(response) + item = TripItem() + item['bid'] = 0 + item['url'] = response.url + + """Getting the Business Id from the DB if exists """ + + qu = ("select distinct u from social_data.tripadvisor1") + cur.execute(qu) + rows = cur.fetchall() + li = [r[0] for r in rows] + + + +# print li + for i in range(len(li)): +# print li[i], item['url'] + try: + lis=li[i].split('-or')[1].split('-')[1] + except: + lis=li[i].split('Reviews-')[1].split('-')[0] + if item['url'].find(lis)>0: + qu1 = ("select distinct id from social_data.tripadvisor1 where u='%s'")%li[i] + cur.execute(qu1) + rows1= cur.fetchall() + li1=[i[0] for i in rows1] + # print li1 + if len(li1) > 0: + item['bid'] = li1[0] + if item['bid'] > 0: + print " ID assigned" + else: + item['bid'] = input("Enter the Business Id here for URL: %s : "%item['url']) + + + print item['bid'] + + + + + item['rating'] = float(hxs1.select('//div[@class="rs rating"]/span/img/@content').extract()[0].encode('ascii', 'ignore').strip()) + item['rv_count'] = int(hxs1.select('//div[@class="rs rating"]/a/span/text()').extract()[0].encode('ascii', 'ignore').strip()) + + if item['rv_count'] > 0: + + no = hxs1.select('//div[@class="review basic_review inlineReviewUpdate provider0"]').extract() + + for i in range(len(no)): + + + try: + xd=hxs1.select('//span[@class="ratingDate"]').extract()[i].encode('utf-8').split('Reviewed')[1].split('\n')[0].replace(',','').replace('\n','').strip() + + item['rv_date'] = str(datetime.strptime(xd,'%B %d %Y')) + + + except: + item['rv_date'] = '0001-01-01 00:00:00' + current_date = datetime.strptime(item['rv_date'],'%Y-%m-%d %X') + + try: + item['rv_profile'] = hxs1.select('//div[@class="username mo"]/span/text()').extract()[i].encode('ascii', 
'ignore').replace('.','').replace("'"," ").strip() + except: + item['rv_profile'] = 'A TripAdvisor reviewer on Facebook' + try: + item['rv_heading'] = hxs1.select('//span[@class="noQuotes"]/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'"," ").strip() + except: + item['rv_heading'] = 'NULL' + + try: + item['rv_rating'] = float(hxs1.select('//div[@class="rating reviewItemInline"]/span/img/@alt').extract()[i].encode('ascii', 'ignore').split(' ')[0].strip()) + except: + item['rv_rating'] = '0.0' + try: + item['rv_dc'] = hxs1.select('//div[@class="entry"]/p[@class="partial_entry"]').extract()[i].encode('ascii', 'ignore').split('\n')[1].replace("'"," ").replace('.','').strip() + except: + item['rv_dc'] = 'NULL' + last_date = current_date + + + """ It Only insert the new Feeds """ + + + + sql = ("insert into social_data.tripadvisor1 select '%s','%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from social_data.tripadvisor1 where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_heading'],item['rv_rating'],item['rv_dc'],item['rv_profile'],item['rv_dc'],item['rv_profile'])) + + cur.execute(sql) + con.commit() + +# print item['url'],item['rv_profile'] + + + """ Parse Next link""" + + + + try: + link = hxs1.select('//div[@class="pgLinks"]/a[@class="guiArw sprite-pageNext "]/@href').extract()[0].encode('ascii', 'ignore').strip() + nxt_link = urljoin(response.url,link) + except: + nxt_link = [] + + + + print nxt_link + + + if nxt_link: + """ Next link Processed """ + yield Request( nxt_link, callback=self.parse_sub) + + else: + print " Progress Completed " + + + + else: + item['rv_date'] = 'NULL' + item['rv_profile'] = 'NULL' + item['rv_rating'] = 'NULL' + item['rv_dc'] = 'NULL' + item['rv_heading'] = 'NULL' + + + + + +CREATE TABLE social_data.tripadvisor +( + bid integer, + url text, + rating double precision, + rv_count integer, + rv_date timestamp without time zone, + 
rv_heading text, + rv_rating double precision, + rv_desc text, + rv_user text +) diff --git a/tweet.py b/tweet.py new file mode 100644 index 00000000..0e3029ca --- /dev/null +++ b/tweet.py @@ -0,0 +1,171 @@ +import tweepy +import time +from datetime import datetime +import psycopg2 + + + +"""Un comment the below modules to get the real time tweets""" +# from tweepy.streaming import StreamListener +# from tweepy import Stream + + +# Consumer keys and access tokens, used for OAuth +consumer_key = 'MExVvZT0Q35926Crko5wFrGdr' +consumer_secret = 'YaK4JJ3w4xEHzE0DvOrQRuPsQjquIA5kqLB6i8McVkkRmaof53' +access_token = '2874668814-M5Vh0eC2u9Tmjk4GkO814bGksAQ57AgZ3pdXsB3' +access_token_secret = 'ean5rZki9KAwE3L5alxauTSMTyUFQYwG8enzSGNEKPxmm' + + +# OAuth process, using the keys and tokens +auth = tweepy.OAuthHandler(consumer_key, consumer_secret) +auth.set_access_token(access_token, access_token_secret) + +api = tweepy.API(auth) + + + +""" Database Creation""" +con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") +cur = con.cursor() + + + +"""Getting tweets from user timeline""" + +def tweet(twitter_profile, business_id, last_tweet_time): + + """Open file to save the data locally""" + fn = '%s' % twitter_profile+'_tweets.csv' + f = open(fn, 'w') + f.write('Business_ID\tName\tTime\tTweets\tRetweet_Count\tFav_count\n') + + # tw = tweepy.Cursor(api.user_timeline, id=twitter_profile).items() + tw = tweepy.Cursor(api.user_timeline, id=twitter_profile).items() + while True: + try: + c = tw.next() + user = c.user.screen_name.encode('utf-8') + tm = c.created_at + + data = c.text.encode('utf-8').replace("'", "").replace('\n', '').replace('"', '') + fc = c.favorite_count + rt = c.retweet_count + # try: + # old_tweet_time = datetime.strptime(last_tweet_time, '%Y-%m-%d %H:%M:%S') + # except: + # old_tweet_time = datetime.strptime('1', '%d') + + if tm > last_tweet_time: + print business_id, user, tm, data, fc, rt + + """Insert the tweet details 
into the csv file""" + + f.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (business_id, user, tm, data, fc, rt)) + + """Insert data into DB""" + + sql = ("insert into public.tweet(id, username, time, tweet, r, f) values('%s','%s','%s','%s','%s','%s')"%(business_id, user, tm, data, fc, rt)) + cur.execute(sql) + con.commit() + + elif tm == last_tweet_time: + print "No New Tweets Updated " + else: + break + + except tweepy.TweepError: + print "Got Exception Please wait for 15 Min to ReConnect or else the give profile is wrong" + time.sleep(61 * 15) + continue + except StopIteration: + break + + + + + +# +"""Un comment the below class to get the real time tweets""" +# class StdOutListener(StreamListener): +# ''' Handles data received from the stream. ''' +# def on_data(self, raw_data): +# print raw_data +# return True +# +# +# +# def on_error(self, status_code): +# print('Got an error with status code: ' + str(status_code)) +# return True # To continue listening +# +# if __name__ == '__main__': +# +# listener = StdOutListener() +# +# stream = Stream(auth, listener) + + + +"""Use the twitter Id to follow the realtime tweets follow=[' Id here']""" +# stream.filter(follow=['2874668814'],track=[]) + + +"""Checkdb function is to check the data into our DB""" + +def checkdb(tname): + + """Get the Unique user name from the DB""" + sql = ("select distinct username from public.tweet") + cur.execute(sql) + rows = cur.fetchall() + + de = [] + li = [] + for r in rows: + de.append(r[0].lower()) + li.append(r[0]) +# """ de list has list of user names in our DB """ + print de + print li + + if tname.lower() in de: + print "Previoius Data Found for the given Twitter Profile name" + j = de.index(tname.lower()) + cur.execute("select distinct id from public.tweet where username='%s'" % li[j]) + r = cur.fetchall() + b_id = r[0][0] + cur.execute("select max(time) from public.tweet where id='%s'" % b_id) + r = cur.fetchall() + ti = r[0][0] + """Get the business_id and max(time) !""" + print b_id, ti + 
"""Call the tweet function with username and business_id and time from the DB""" + tweet(twitter_profile_name, b_id, ti) + else: + """No Previous Data in DB so Take all tweets""" + alltweets() + + """Below If works if the DB is empty""" + if len(de) == 0: + alltweets() + + + +"""Get all tweets by calling alltweet function""" + +def alltweets(): + print "No previous data for given profile\n" + b_id = input("Enter Business Id") + ti = datetime.strptime('1', '%d') + tweet(twitter_profile_name, b_id, ti) + + + +"""Enter the twitter Profile """ +twitter_profile_name = 'rparthiepan' + +checkdb(twitter_profile_name) + +"""DB connection close""" +con.close() \ No newline at end of file diff --git a/ur.py b/ur.py new file mode 100644 index 00000000..e269b90e --- /dev/null +++ b/ur.py @@ -0,0 +1,235 @@ +# _print " Enter Url here" +# url =['http://www.yelp.com/biz/firebrew-virginia-beach','http://www.yelp.com/biz/taboon-grand-blanc-grand-blanc-township','http://www.yelp.com/biz/lincoln-washington','http://www.yelp.com/biz/zingermans-delicatessen-ann-arbor-2'] +# url =['http://www.urbanspoon.com/r/1/4524/restaurant/Capitol-Hill/Honey-Hole-Sandwiches-Seattle','http://www.urbanspoon.com/r/1/4524/restaurant/Capitol-Hill/Honey-Hole-Sandwiches-Seattle','http://www.urbanspoon.com/r/13/169913/restaurant/North-Richland-Hills-Richland-Hills/Texs-Star-Grill-Watauga'] +# +# from scrapy.spider import BaseSpider +# from scrapy.http import Request +# from urlparse import urljoin +# from scrapy.selector import HtmlXPathSelector +# from urbans.items import UrbansItem +import psycopg2 +# import time +# +# +# """ Database Creation""" +import datetime +con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") +cur = con.cursor() +bid=12 +cur.execute("select max(d) from y where id='%s'" %bid) +r = cur.fetchall() +i = r[0][0] +x='2015-01-25' + +try: + ff = datetime.datetime.strptime(x, '%Y-%m-%d') +except: + ff = datetime.datetime.strptime('1', '%d') +try: + 
if ff >= i: + print "ok" + else: + print i,ff +except: + print "do " +#print r,'\n',type(i),i,len(i) +# +# class UrbansSpider(BaseSpider): +# +# +# name = "urban" +# start_urls=[url[i] for i in range(len(url))] +# f=open('opt.csv','w') +# +# def parse(self, response): +# +# itm=[] +# hxs = HtmlXPathSelector(response) +# item = UrbansItem() +# +# item['bid'] = 1 +# item['u'] = response.url +# item['li'] = hxs.select('//div[@class="rating"]/text()').extract()[0].encode('utf-8') +# item['nv'] = hxs.select('//div[@class="stats"]/div/text()').extract()[0].encode('utf-8').replace('\n','') +# item['nr'] = hxs.select('//a[@data-ga-action="reviews"]/text()').extract()[0].encode('utf-8').split(' ')[0] +# x= 'http://www.urbanspoon.com'+hxs.select('//div[@data-ga-action="diner-reviews"]/@data-url').extract()[0].encode('utf-8') +# if x: +# yield Request(x, callback=self.parse_sub, meta={'item':item}) +# +# def parse_sub(self, response): +# +# item = UrbansItem(response.meta['item']) +# +# hxs = HtmlXPathSelector(response) +# +# for i in range(20): +# o=hxs.select('//div[@class="title"]/text()').extract()[i].encode('utf-8').replace("'","") +# u= hxs.select('//a[@data-ga-action="user-profile-page"]/text()').extract()[i].encode('utf-8') +# e=hxs.select('//time[@class="posted-on"]/text()').extract()[i].encode('utf-8').split(' ')[2].replace('\n','') +# e1=time.strftime(e) +# print type(e1) +# +# item['rd'] = hxs.select('//div[@itemprop="description"]/text()').extract()[i].encode('utf-8').replace('\n','').replace("'","").strip() +# sql = ("insert into public.ep select %s,'%s','%s',%s,%s,'%s','%s','%s','%s' where not exists ( select * from public.ep where dc='%s' and us='%s')"%(item['bid'],item['u'],item['nv'],item['li'],item['nr'],e1,o,item['rd'],u,item['rd'],u)) +# cur.execute(sql) +# con.commit() +# +# +# +# +# +# +# +# +# Sprites +# +# +# +# +#

+# +#
+#
+# +#
+#
+# +#
+# +# +# +# +# +# +# +# +# +# " Enter Url here" +# +# url =['http://www.yelp.com/biz/firebrew-virginia-beach','http://www.yelp.com/biz/sabra-design-washington-3?osq=web+design+companies'] +# +# from scrapy.spider import BaseSpider +# from scrapy.http import Request +# from urlparse import urljoin +# from scrapy.selector import HtmlXPathSelector +# from urbans.items import UrbansItem +# import psycopg2 +# import time +# +# +# """ Database Creation""" +# con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") +# cur = con.cursor() +# cur.close() +# class UrbansSpider(BaseSpider): +# +# +# name = "ylp" +# start_urls=[url[i] for i in range(len(url))] +# f=open('opt.csv','w') +# count = 0 +# def parse(self, response): +# +# itm=[] +# hxs = HtmlXPathSelector(response) +# item = UrbansItem() +# item['bid'] = 0 +# item['url'] = response.url +# qu = ("select distinct u from ssdd") +# cur.execute(qu) +# rows = cur.fetchall() +# li = [r[0] for r in rows] +# print li +# for i in range(len(li)): +# print li[i], item['url'] +# if li[i] in item['url']: +# qu1 = ("select distinct id from ssdd where u='%s'")%li[i] +# cur.execute(qu1) +# cur +# rows1= cur.fetchall() +# li1=[i[0] for i in rows1] +# print li1 +# if len(li1) > 0: +# item['bid'] = li1[0] +# if item['bid'] > 0: +# print " ID assigned" +# else: +# item['bid'] = input("Enter the Business Id here") +# +# +# print item['bid'] +# item['rating'] = float(hxs.select('//div[@itemprop="aggregateRating"]/div/meta/@content').extract()[0].encode('utf-8').strip()) +# item['rv_count'] = int(hxs.select('//span[@itemprop="reviewCount"]/text()').extract()[0].encode('utf-8').strip()) +# +# if item['rv_count'] > 0: +# +# no = len(hxs.select('//div[@itemprop="review"]').extract()) +# x=hxs.select('//div[@class="review-list"]/ul/li') +# for i in range(no): +# +# self.__class__.count = self.__class__.count + 1 +# try: +# item['rv_date'] = 
x[0].select('//meta[@itemprop="datePublished"]/@content').extract()[i].encode('utf-8').strip() +# except: +# item['rv_date'] = 'NULL' +# try: +# item['rv_profile'] = x[0].select('//li[@class="user-name"]/a/text()').extract()[i].encode('utf-8').strip() +# except: +# item['rv_profile'] = 'NULL' +# try: +# item['rv_rating'] = float(x[0].select('//div[@itemprop="reviewRating"]/div/meta/@content').extract()[i].encode('utf-8').strip()) +# except: +# item['rv_rating'] = 'NULL' +# try: +# item['rv_dc'] = x[0].select('//div[@class="review-content"]/p').extract()[i].encode('utf-8').split('lang="en">')[1].replace("
",'').replace('

','').replace('\xc2','').replace('\xa0','').replace("'"," ").strip() +# except: +# item['rv_dc'] = 'NULL' +# +# sql = ("insert into public.ssdd select %s,'%s',%s,%s,'%s','%s','%s',%s where not exists ( select * from public.ssdd where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_profile'],item['rv_dc'],item['rv_rating'],item['rv_dc'],item['rv_profile'])) +# +# cur.execute(sql) +# con.commit() +# +# print self.__class__.count +# if item['rv_count'] > self.__class__.count: +# +# nxt_link = hxs.select('//a[@class="page-option prev-next"]/@href').extract()[0].encode('utf-8').strip() +# +# if nxt_link: +# """ Next link Processed """ +# yield Request( nxt_link, callback=self.parse) +# +# else: +# self.__class__.count = 0 +# """ Reviews below 40 for this business""" +# +# else: +# item['rv_date'] = 'NULL' +# item['rv_profile'] = 'NULL' +# item['rv_rating'] = 'NULL' +# item['rv_dc'] = 'NULL' +# +# +# +# +# diff --git a/urban.py b/urban.py new file mode 100644 index 00000000..959d1a46 --- /dev/null +++ b/urban.py @@ -0,0 +1,108 @@ +" Enter Url here" + +url =['http://www.yelp.com/biz/firebrew-virginia-beach','http://www.yelp.com/biz/sabra-design-washington-3?osq=web+design+companies'] + + +from scrapy.spider import BaseSpider +from scrapy.http import Request +from urlparse import urljoin +from scrapy.selector import HtmlXPathSelector +from urbans.items import UrbansItem +import psycopg2 + + + +""" Database Creation""" +con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") +cur = con.cursor() + +class UrbansSpider(BaseSpider): + + + name = "ylp" + start_urls=[url[i] for i in range(len(url))] + f=open('opt.csv','w') + count = 0 + def parse(self, response): + + itm=[] + hxs = HtmlXPathSelector(response) + item = UrbansItem() + item['bid'] = 0 + item['url'] = response.url + qu = ("select distinct u from ssdd") + cur.execute(qu) + rows = cur.fetchall() + li = [r[0] for r in rows] 
+ print li + for i in range(len(li)): + print li[i], item['url'] + if li[i] in item['url']: + qu1 = ("select distinct id from ssdd where u='%s'")%li[i] + cur.execute(qu1) + rows1= cur.fetchall() + li1=[i[0] for i in rows1] + print li1 + if len(li1) > 0: + item['bid'] = li1[0] + if item['bid'] > 0: + print " ID assigned" + else: + item['bid'] = input("Enter the Business Id here") + + + print item['bid'] + item['rating'] = float(hxs.select('//div[@itemprop="aggregateRating"]/div/meta/@content').extract()[0].encode('utf-8').strip()) + item['rv_count'] = int(hxs.select('//span[@itemprop="reviewCount"]/text()').extract()[0].encode('utf-8').strip()) + + if item['rv_count'] > 0: + + no = len(hxs.select('//div[@itemprop="review"]').extract()) + x=hxs.select('//div[@class="review-list"]/ul/li') + for i in range(no): + + self.__class__.count = self.__class__.count + 1 + try: + item['rv_date'] = x[0].select('//meta[@itemprop="datePublished"]/@content').extract()[i].encode('utf-8').strip() + except: + item['rv_date'] = 'NULL' + try: + item['rv_profile'] = x[0].select('//li[@class="user-name"]/a/text()').extract()[i].encode('utf-8').strip() + except: + item['rv_profile'] = 'NULL' + try: + item['rv_rating'] = float(x[0].select('//div[@itemprop="reviewRating"]/div/meta/@content').extract()[i].encode('utf-8').strip()) + except: + item['rv_rating'] = 'NULL' + try: + item['rv_dc'] = x[0].select('//div[@class="review-content"]/p').extract()[i].encode('utf-8').split('lang="en">')[1].replace("
",'').replace('

','').replace('\xc2','').replace('\xa0','').replace("'"," ").strip() + except: + item['rv_dc'] = 'NULL' + + sql = ("insert into public.ssdd select %s,'%s',%s,%s,'%s','%s','%s',%s where not exists ( select * from public.ssdd where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_profile'],item['rv_dc'],item['rv_rating'],item['rv_dc'],item['rv_profile'])) + + cur.execute(sql) + con.commit() + + print self.__class__.count + if item['rv_count'] > self.__class__.count: + + nxt_link = hxs.select('//a[@class="page-option prev-next"]/@href').extract()[0].encode('utf-8').strip() + + if nxt_link: + """ Next link Processed """ + yield Request( nxt_link, callback=self.parse) + + else: + self.__class__.count = 0 + """ Reviews below 40 for this business""" + + else: + item['rv_date'] = 'NULL' + item['rv_profile'] = 'NULL' + item['rv_rating'] = 'NULL' + item['rv_dc'] = 'NULL' + + + + diff --git a/yelp_spider.py b/yelp_spider.py new file mode 100644 index 00000000..2f57ed86 --- /dev/null +++ b/yelp_spider.py @@ -0,0 +1,308 @@ +" Enter Url here" + +""" Before enter the URL please check when the sort by date is performed the url is appends only the date params """ + +""" If not please edit the url and enter in below list""" + + +url =['http://www.yelp.com/biz/the-face-shop-santa-clara','http://www.yelp.com/biz/trader-joes-mountain-view'] + +from scrapy.spider import BaseSpider +from scrapy.http import Request +from urlparse import urljoin +from scrapy.selector import HtmlXPathSelector +from yelp.items import YelpItem +import psycopg2 +from datetime import datetime + + +""" Database Creation""" +try: + con = psycopg2.connect(database="mm", user="postgres", password="mercuryminds", host="localhost") + cur = con.cursor() +except: + """ Database not Connected """ + +class YelpSpider(BaseSpider): + + + name = "ylpp" + start_urls=[url[i]+'?sort_by=date_desc' for i in range(len(url))] +# f=open('opt.csv','w') + + def parse(self, 
response): + + itm=[] + hxs = HtmlXPathSelector(response) + item = YelpItem() + item['bid'] = 0 + item['url'] = response.url + + """Getting the Business Id from the DB if exists """ + + qu = ("select distinct u from y") + cur.execute(qu) + rows = cur.fetchall() + li = [r[0] for r in rows] +# print li + for i in range(len(li)): + # print li[i], item['url'] + if li[i].split('?')[0] in item['url']: + qu1 = ("select distinct id from y where u='%s'")%li[i] + cur.execute(qu1) + rows1= cur.fetchall() + li1=[i[0] for i in rows1] + # print li1 + if len(li1) > 0: + item['bid'] = li1[0] + """ Maximum Date has been verified from the DB through bid """ + + qu2 = ("select max(d) from y where id =%d")%item['bid'] + cur.execute(qu2) + rows2=cur.fetchall() + max_date = [i[0] for i in rows2][0] + +# print max_date + + + if item['bid'] > 0: + print " ID assigned" + else: + item['bid'] = input("Enter the Business Id here for URL: %s : "%item['url']) + max_date = datetime.strptime('','') + print max_date + + print item['bid'] + + + + + item['rating'] = float(hxs.select('//div[@itemprop="aggregateRating"]/div/meta/@content').extract()[0].encode('ascii', 'ignore').strip()) + item['rv_count'] = int(hxs.select('//span[@itemprop="reviewCount"]/text()').extract()[0].encode('ascii', 'ignore').strip()) + + if item['rv_count'] > 0: + + no = len(hxs.select('//div[@itemprop="review"]').extract()) + x=hxs.select('//div[@class="review-list"]/ul/li') + for i in range(no): + + + try: + item['rv_date'] = x[0].select('//meta[@itemprop="datePublished"]/@content').extract()[i].encode('ascii', 'ignore').strip() + except: + item['rv_date'] = 'NULL' + + current_date = datetime.strptime(item['rv_date'],'%Y-%m-%d') + + try: + item['rv_profile'] = x[0].select('//li[@class="user-name"]/a/text()').extract()[i].encode('ascii', 'ignore').replace('.','').replace("'","").strip() + except: + item['rv_profile'] = 'NULL' + try: + item['rv_rating'] = 
float(x[0].select('//div[@itemprop="reviewRating"]/div/meta/@content').extract()[i].encode('ascii', 'ignore').strip()) + except: + item['rv_rating'] = 'NULL' + try: + item['rv_dc'] = x[0].select('//div[@class="review-content"]/p').extract()[i].encode('ascii', 'ignore').split('lang="en">')[1].replace("
",'').replace('

','').replace("'"," ").replace('.','').strip() + except: + item['rv_dc'] = 'NULL' +# print current_date,max_date + last_date = current_date + + + """ It Only insert the new Feeds """ + + if current_date >= max_date: + + sql = ("insert into y select '%s','%s','%s','%s','%s','%s','%s','%s' where not exists ( select * from y where dc='%s' and p='%s')"%(item['bid'],item['url'],item['rating'],item['rv_count'],item['rv_date'],item['rv_profile'],item['rv_dc'],item['rv_rating'],item['rv_dc'],item['rv_profile'])) + + cur.execute(sql) + con.commit() + + else: + break + print "No updated review are here" + + + print item['url'],item['rv_profile'] + + + + + """Check the New reviews is in next page and it will not include newly entered url """ + + if item['rv_count'] > 40 and last_date > max_date and '1900-01-01 00:00:00' not in str(max_date): + + nxt_link = [] + + """ Parse the nex review link by multiple of 40""" + + + if float(item['rv_count'])/40 > item['rv_count']/40: + x= range(40,(item['rv_count']/40)*40+1,40) + for i in range(len(x)): + nxt_link.append(hxs.select('//a[@class="page-option prev-next"]/@href').extract()[0].encode('ascii', 'ignore').split('?')[0]+'?start=%s'%x[i]+'&sort_by=date_desc') + else: + x= range(40,(item['rv_count']/40-1)*40+1,40) + for i in range(len(x)): + nxt_link.append(hxs.select('//a[@class="page-option prev-next"]/@href').extract()[0].encode('ascii', 'ignore').split('?')[0]+'?start=%s'%x[i]+'&sort_by=date_desc') + + print nxt_link + + for i in range(len(nxt_link)): + if nxt_link[i]: + """ Next link Processed """ + yield Request( nxt_link[i], callback=self.parse) + + + elif item['rv_count'] > 40 and '1900-01-01 00:00:00' in str(max_date): + + nxt_link = [] + + """ Parse the nex review link by multiple of 40""" + + """Check the reviews is in next page it works only for the new url """ + + if float(item['rv_count'])/40 > item['rv_count']/40: + x= range(40,(item['rv_count']/40)*40+1,40) + for i in range(len(x)): + 
def parse_sub(self, response):
    """Store every review found on *response*; used for first-time URLs.

    The callback resolves a business id for the page (from table ``y`` when
    a stored URL matches, otherwise by asking the operator on stdin),
    inserts each review on the page into ``y`` unless a row with the same
    text and profile already exists, and then schedules the remaining
    review pages.

    Yields:
        ``Request`` objects for the follow-up review pages (offsets 40, 80,
        ... below the review count), each handled by ``parse_sub`` again.
    """
    import ast
    import re

    print("Sub parse Called ")
    hxs1 = HtmlXPathSelector(response)
    item = YelpItem()
    item['bid'] = 0
    item['url'] = response.url

    # Reuse the business id already stored for this URL, if any. When
    # several stored URLs match, the last one with an id wins.
    # NOTE(review): every SQL statement in this method is built by string
    # interpolation from scraped/stored text. Switch to the driver's
    # parameter binding once the DB-API paramstyle in use is confirmed.
    cur.execute("select distinct u from y")
    known_urls = [row[0] for row in cur.fetchall()]
    for known_url in known_urls:
        if known_url.split('?')[0] in item['url']:
            cur.execute("select distinct id from y where u='%s'" % known_url)
            ids = [row[0] for row in cur.fetchall()]
            if ids:
                item['bid'] = ids[0]

    if item['bid'] > 0:
        print(" ID assigned")
    else:
        # Python 2 ``input()`` evaluates whatever is typed as an expression.
        # Read the raw line and parse it as a literal only.
        try:
            read_line = raw_input
        except NameError:
            read_line = input
        answer = read_line(
            "Enter the Business Id here for URL: %s : " % item['url'])
        item['bid'] = ast.literal_eval(answer.strip())

    print(item['bid'])

    item['rating'] = float(
        hxs1.select('//div[@itemprop="aggregateRating"]/div/meta/@content')
        .extract()[0].encode('ascii', 'ignore').strip())
    item['rv_count'] = int(
        hxs1.select('//span[@itemprop="reviewCount"]/text()')
        .extract()[0].encode('ascii', 'ignore').strip())

    if item['rv_count'] > 0:
        review_total = len(hxs1.select('//div[@itemprop="review"]').extract())
        review_list = hxs1.select('//div[@class="review-list"]/ul/li')
        insert_sql = ("insert into y select '%s','%s','%s','%s','%s','%s','%s','%s' "
                      "where not exists ( select * from y where dc='%s' and p='%s')")

        # The XPath expressions below start with ``//`` and the results are
        # indexed by review position, so the fields line up only when every
        # review on the page carries each field.
        # NOTE(review): confirm, or switch to per-review relative paths.
        for idx in range(review_total):
            try:
                item['rv_date'] = (
                    review_list[0]
                    .select('//meta[@itemprop="datePublished"]/@content')
                    .extract()[idx].encode('ascii', 'ignore').strip())
            except Exception:
                item['rv_date'] = 'NULL'

            try:
                item['rv_profile'] = (
                    review_list[0]
                    .select('//li[@class="user-name"]/a/text()')
                    .extract()[idx].encode('ascii', 'ignore')
                    .replace('.', '').replace("'", "").strip())
            except Exception:
                item['rv_profile'] = 'NULL'

            try:
                item['rv_rating'] = float(
                    review_list[0]
                    .select('//div[@itemprop="reviewRating"]/div/meta/@content')
                    .extract()[idx].encode('ascii', 'ignore').strip())
            except Exception:
                item['rv_rating'] = 'NULL'

            try:
                raw_text = (
                    review_list[0]
                    .select('//div[@class="review-content"]/p')
                    .extract()[idx].encode('ascii', 'ignore')
                    .split('lang="en">')[1])
                # NOTE(review): the two literals originally removed here
                # were lost (they look like HTML tags); all markup tags are
                # stripped instead -- confirm this matches the intent.
                raw_text = re.sub(r'<[^>]+>', '', raw_text)
                item['rv_dc'] = (
                    raw_text.replace("'", " ").replace('.', '').strip())
            except Exception:
                item['rv_dc'] = 'NULL'

            sql = insert_sql % (
                item['bid'], item['url'], item['rating'], item['rv_count'],
                item['rv_date'], item['rv_profile'], item['rv_dc'],
                item['rv_rating'], item['rv_dc'], item['rv_profile'])
            cur.execute(sql)
            con.commit()

        if item['rv_count'] > 40:
            # Follow-up pages start at offsets 40, 80, ... strictly below
            # the review count; this covers both the "multiple of 40" and
            # the "not a multiple of 40" case.
            base_url = (
                hxs1.select('//a[@class="page-option prev-next"]/@href')
                .extract()[0].encode('ascii', 'ignore').split('?')[0])
            nxt_link = [
                base_url + '?start=%s' % offset + '&sort_by=date_desc'
                for offset in range(40, item['rv_count'], 40)
            ]
            print(nxt_link)
            for link in nxt_link:
                if link:
                    yield Request(link, callback=self.parse_sub)
        else:
            # 40 reviews or fewer: there are no further pages to request.
            print("Do nothing")
    else:
        # No reviews for this business; nothing is stored in this branch.
        item['rv_date'] = 'NULL'
        item['rv_profile'] = 'NULL'
        item['rv_rating'] = 'NULL'
        item['rv_dc'] = 'NULL'