Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,13 +3,11 @@ import gradio as gr
|
|
| 3 |
from PIL import Image
|
| 4 |
import requests
|
| 5 |
|
| 6 |
-
|
| 7 |
from transformers import ViTFeatureExtractor
|
| 8 |
feature_extractor = ViTFeatureExtractor()
|
| 9 |
# or, to load one that corresponds to a checkpoint on the hub:
|
| 10 |
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
|
| 11 |
|
| 12 |
-
|
| 13 |
from transformers import VisionEncoderDecoderModel
|
| 14 |
# initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
|
| 15 |
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
|
|
@@ -28,15 +26,6 @@ tokenizer = AutoTokenizer.from_pretrained(repo_name)
|
|
| 28 |
model = VisionEncoderDecoderModel.from_pretrained(repo_name)
|
| 29 |
|
| 30 |
def get_quote(image):
|
| 31 |
-
|
| 32 |
-
#image = Image.open(image_1).raw
|
| 33 |
-
#image = Image.open(image_1)
|
| 34 |
-
|
| 35 |
-
#url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 36 |
-
#with Image.open(requests.get(url, stream=True).raw) as image:
|
| 37 |
-
|
| 38 |
-
#image.save("cats.png")
|
| 39 |
-
|
| 40 |
|
| 41 |
##############
|
| 42 |
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
|
|
@@ -47,18 +36,12 @@ def get_quote(image):
|
|
| 47 |
# decode into text
|
| 48 |
preds = tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)
|
| 49 |
preds = [pred.strip() for pred in preds]
|
| 50 |
-
#print(preds)
|
| 51 |
-
|
| 52 |
return preds
|
| 53 |
|
| 54 |
-
|
| 55 |
#1: Text to Speech
|
| 56 |
-
|
| 57 |
-
title = "Image to text generation"
|
| 58 |
|
| 59 |
-
demo = gr.Interface(fn=get_quote, inputs=gr.inputs.Image(type="pil"), outputs=['text'],title = title, description = "
|
| 60 |
-
#inputs = "image"
|
| 61 |
-
#inputs=gr.inputs.Image(type="pil")
|
| 62 |
if __name__ == "__main__":
|
| 63 |
|
| 64 |
demo.launch(debug=True, cache_examples=True)
|
|
|
|
| 3 |
from PIL import Image
|
| 4 |
import requests
|
| 5 |
|
|
|
|
| 6 |
from transformers import ViTFeatureExtractor
|
| 7 |
feature_extractor = ViTFeatureExtractor()
|
| 8 |
# or, to load one that corresponds to a checkpoint on the hub:
|
| 9 |
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
|
| 10 |
|
|
|
|
| 11 |
from transformers import VisionEncoderDecoderModel
|
| 12 |
# initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
|
| 13 |
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
|
|
|
|
| 26 |
model = VisionEncoderDecoderModel.from_pretrained(repo_name)
|
| 27 |
|
| 28 |
def get_quote(image):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
##############
|
| 31 |
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
|
|
|
|
| 36 |
# decode into text
|
| 37 |
preds = tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)
|
| 38 |
preds = [pred.strip() for pred in preds]
|
|
|
|
|
|
|
| 39 |
return preds
|
| 40 |
|
|
|
|
| 41 |
#1: Text to Speech
|
| 42 |
+
title = "Get a sentence with items, present in the image"
|
|
|
|
| 43 |
|
| 44 |
+
demo = gr.Interface(fn=get_quote, inputs=gr.inputs.Image(type="pil"), outputs=['text'],title = title, description = "Upload an image file and get text from it" ,cache_examples=False, enable_queue=True).launch()
|
|
|
|
|
|
|
| 45 |
if __name__ == "__main__":
|
| 46 |
|
| 47 |
demo.launch(debug=True, cache_examples=True)
|